diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp index 8c0360354ef..c89ef347b81 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp @@ -155,6 +155,7 @@ REGISTER_FACTORY(v3, EmbeddingBagOffsetsSum); REGISTER_FACTORY(v3, EmbeddingBagPackedSum); REGISTER_FACTORY(v3, EmbeddingSegmentsSum); REGISTER_FACTORY(v3, ExtractImagePatches); +REGISTER_FACTORY(v3, NonZero); REGISTER_FACTORY(v3, ROIAlign); REGISTER_FACTORY(v3, ScatterUpdate); REGISTER_FACTORY(v3, ScatterElementsUpdate); diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/non_zero.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/non_zero.hpp new file mode 100644 index 00000000000..febdfc142d8 --- /dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/non_zero.hpp @@ -0,0 +1,39 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "primitive.hpp" + +namespace cldnn { + +struct count_nonzero : public primitive_base { + CLDNN_DECLARE_PRIMITIVE(count_nonzero) + + /// @brief Constructs count_nonzero primitive. + /// @param id This primitive id. + /// @param data Input data primitive id. + count_nonzero(const primitive_id& id, + const primitive_id& data, + const primitive_id& ext_prim_id = "", + const padding& output_padding = padding()) + : primitive_base(id, {data}, ext_prim_id, output_padding) {} +}; + +struct gather_nonzero : public primitive_base { + CLDNN_DECLARE_PRIMITIVE(gather_nonzero) + + /// @brief Constructs gather_nonzero primitive. + /// @param id This primitive id. + /// @param data Input data primitive id. 
+ /// @param output_shape Output shape [rank of data, number of nonzero elements] + gather_nonzero(const primitive_id& id, + const primitive_id& data, + const primitive_id& output_shape, + const primitive_id& ext_prim_id = "", + const padding& output_padding = padding()) + : primitive_base(id, {data, output_shape}, ext_prim_id, output_padding) {} +}; + +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/non_zero.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/non_zero.cpp new file mode 100644 index 00000000000..7fd4e00fbcb --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/non_zero.cpp @@ -0,0 +1,119 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "non_zero_inst.h" +#include "primitive_base.hpp" +#include "impls/implementation_map.hpp" +#include "kernel_selector_helper.h" +#include "non_zero/count_nonzero_kernel_ref.h" +#include "non_zero/count_nonzero_kernel_selector.h" +#include "non_zero/gather_nonzero_kernel_ref.h" +#include "non_zero/gather_nonzero_kernel_selector.h" +#include "intel_gpu/runtime/error_handler.hpp" + +using namespace cldnn; + +namespace cldnn { +namespace ocl { + +struct count_nonzero_impl : typed_primitive_impl_ocl { + using parent = typed_primitive_impl_ocl; + using parent::parent; + + std::unique_ptr clone() const override { + return make_unique(*this); + } + + static primitive_impl* create(const count_nonzero_node& arg, const kernel_impl_params& impl_param) { + auto nonzero_params = get_default_params(impl_param); + auto nonzero_optional_params = + get_default_optional_params(arg.get_program()); + + auto& kernel_selector = kernel_selector::count_nonzero_kernel_selector::Instance(); + auto best_kernels = kernel_selector.GetBestKernels(nonzero_params, nonzero_optional_params); + + OPENVINO_ASSERT(!best_kernels.empty(), "Cannot find a proper kernel for ", arg.id()); + + auto count_nonzero = new count_nonzero_impl(arg, best_kernels[0]); + + return 
count_nonzero; + } +}; + +struct gather_nonzero_impl : typed_primitive_impl_ocl { + using parent = typed_primitive_impl_ocl; + using parent::parent; + + std::unique_ptr clone() const override { + return make_unique(*this); + } + +public: + static primitive_impl* create(const gather_nonzero_node& arg, const kernel_impl_params& impl_param) { + auto nonzero_params = get_default_params(impl_param); + auto nonzero_optional_params = + get_default_optional_params(arg.get_program()); + + nonzero_params.inputs.push_back(convert_data_tensor(arg.input(1).get_output_layout())); + + auto& kernel_selector = kernel_selector::gather_nonzero_kernel_selector::Instance(); + auto best_kernels = kernel_selector.GetBestKernels(nonzero_params, nonzero_optional_params); + + OPENVINO_ASSERT(!best_kernels.empty(), "Cannot find a proper kernel for ", arg.id()); + + auto gather_nonzero = new gather_nonzero_impl(arg, best_kernels[0]); + + return gather_nonzero; + } +}; + +namespace detail { + +attach_count_nonzero_impl::attach_count_nonzero_impl() { + implementation_map::add(impl_types::ocl, count_nonzero_impl::create, { + std::make_tuple(data_types::f32, format::bfyx), + std::make_tuple(data_types::f16, format::bfyx), + std::make_tuple(data_types::i32, format::bfyx), + std::make_tuple(data_types::i8, format::bfyx), + std::make_tuple(data_types::u8, format::bfyx), + + std::make_tuple(data_types::f32, format::bfzyx), + std::make_tuple(data_types::f16, format::bfzyx), + std::make_tuple(data_types::i32, format::bfzyx), + std::make_tuple(data_types::i8, format::bfzyx), + std::make_tuple(data_types::u8, format::bfzyx), + + std::make_tuple(data_types::f32, format::bfwzyx), + std::make_tuple(data_types::f16, format::bfwzyx), + std::make_tuple(data_types::i32, format::bfwzyx), + std::make_tuple(data_types::i8, format::bfwzyx), + std::make_tuple(data_types::u8, format::bfwzyx), + }); +} + +attach_gather_nonzero_impl::attach_gather_nonzero_impl() { + implementation_map::add(impl_types::ocl, 
gather_nonzero_impl::create, { + std::make_tuple(data_types::f32, format::bfyx), + std::make_tuple(data_types::f16, format::bfyx), + std::make_tuple(data_types::i32, format::bfyx), + std::make_tuple(data_types::i8, format::bfyx), + std::make_tuple(data_types::u8, format::bfyx), + + std::make_tuple(data_types::f32, format::bfzyx), + std::make_tuple(data_types::f16, format::bfzyx), + std::make_tuple(data_types::i32, format::bfzyx), + std::make_tuple(data_types::i8, format::bfzyx), + std::make_tuple(data_types::u8, format::bfzyx), + + std::make_tuple(data_types::f32, format::bfwzyx), + std::make_tuple(data_types::f16, format::bfwzyx), + std::make_tuple(data_types::i32, format::bfwzyx), + std::make_tuple(data_types::i8, format::bfwzyx), + std::make_tuple(data_types::u8, format::bfwzyx), + }); +} + +} // namespace detail +} // namespace ocl +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp index 9dd25effb85..3194e3d3abf 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp @@ -90,6 +90,8 @@ void register_implementations() { REGISTER_OCL(embedding_bag); REGISTER_OCL(extract_image_patches); REGISTER_OCL(convert_color); + REGISTER_OCL(count_nonzero); + REGISTER_OCL(gather_nonzero); } } // namespace ocl diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp index cad3e500f13..1069f0f80f3 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp @@ -73,6 +73,7 @@ #include "intel_gpu/primitives/space_to_batch.hpp" #include "intel_gpu/primitives/strided_slice.hpp" #include "intel_gpu/primitives/tile.hpp" +#include "intel_gpu/primitives/non_zero.hpp" namespace cldnn { namespace ocl { @@ -166,6 +167,8 @@ REGISTER_OCL(cum_sum); REGISTER_OCL(embedding_bag); 
REGISTER_OCL(extract_image_patches); REGISTER_OCL(convert_color); +REGISTER_OCL(count_nonzero); +REGISTER_OCL(gather_nonzero); #undef REGISTER_OCL diff --git a/src/plugins/intel_gpu/src/graph/include/non_zero_inst.h b/src/plugins/intel_gpu/src/graph/include/non_zero_inst.h new file mode 100644 index 00000000000..326c044dd83 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/include/non_zero_inst.h @@ -0,0 +1,81 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once +#include "intel_gpu/primitives/non_zero.hpp" +#include "primitive_inst.h" +#include "intel_gpu/runtime/error_handler.hpp" + +#include +#include + +namespace cldnn { + +template <> +struct typed_program_node : public typed_program_node_base { + using parent = typed_program_node_base; + typed_program_node(const std::shared_ptr prim, program& prog) : parent(prim, prog) { + support_padding_all(true); + } +public: + using parent::parent; + + program_node& input() const { + OPENVINO_ASSERT(dependencies.size() == 1, "[GPU] Primitive ", id(), " has invalid number of depndencies"); + return get_dependency(0); + } +}; + +using count_nonzero_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base { + using parent = typed_primitive_inst_base; + +public: + static layout calc_output_layout(count_nonzero_node const& node, kernel_impl_params const& impl_param); + static std::string to_string(count_nonzero_node const& node); + + typed_primitive_inst(network& network, count_nonzero_node const& node); + +private: + void on_execute() override; +}; + +using count_nonzero_inst = typed_primitive_inst; + +template <> +struct typed_program_node : public typed_program_node_base { + using parent = typed_program_node_base; + typed_program_node(const std::shared_ptr prim, program& prog) : parent(prim, prog) { + 
support_padding_all(true); + } + +public: + using parent::parent; + + bool generates_dynamic_output() const override { return true; } + + program_node& input(size_t index = 0) const { + OPENVINO_ASSERT(dependencies.size() == 2, "[GPU] Primitive ", id(), " has invalid number of depndencies"); + return get_dependency(index); + } +}; + +using gather_nonzero_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base { + using parent = typed_primitive_inst_base; + +public: + static layout calc_output_layout(gather_nonzero_node const& node, kernel_impl_params const& impl_param); + static std::string to_string(gather_nonzero_node const& node); + + typed_primitive_inst(network& network, gather_nonzero_node const& node); +}; + +using gather_nonzero_inst = typed_primitive_inst; +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/non_zero.cpp b/src/plugins/intel_gpu/src/graph/non_zero.cpp new file mode 100644 index 00000000000..471bef25228 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/non_zero.cpp @@ -0,0 +1,93 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#include "non_zero_inst.h" +#include "primitive_type_base.h" +#include "intel_gpu/runtime/memory.hpp" +#include "intel_gpu/runtime/error_handler.hpp" +#include "intel_gpu/runtime/debug_configuration.hpp" +#include "json_object.h" +#include + +namespace cldnn { + +// ----------------------------------------------- +// count_nonzero +// ----------------------------------------------- +primitive_type_id count_nonzero::type_id() { + static primitive_type_base instance; + return &instance; +} + +layout count_nonzero_inst::calc_output_layout(count_nonzero_node const& node, kernel_impl_params const& impl_param) { + assert(static_cast(node.get_primitive()->output_data_type) == false && + "Output data type forcing is not 
supported for count_nonzero_node!"); + return layout{cldnn::data_types::i32, cldnn::format::bfyx, tensor{1, 1, 1, 4}}; +} + +std::string count_nonzero_inst::to_string(count_nonzero_node const& node) { + auto desc = node.get_primitive(); + auto node_info = node.desc_to_json(); + auto& input = node.input(); + + std::stringstream primitive_description; + + json_composite count_nonzero_info; + count_nonzero_info.add("input id", input.id()); + count_nonzero_info.add("output shape", tensor{1, 1, 1, 4}); + + node_info->add("count_nonzero info", count_nonzero_info); + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +count_nonzero_inst::typed_primitive_inst(network& network, count_nonzero_node const& node) : parent(network, node) {} + +void count_nonzero_inst::on_execute() { + output_memory().fill(_network.get_stream(), 0); +} + +// ----------------------------------------------- +// gather_nonzero +// ----------------------------------------------- +primitive_type_id gather_nonzero::type_id() { + static primitive_type_base instance; + return &instance; +} + +layout gather_nonzero_inst::calc_output_layout(gather_nonzero_node const& node, kernel_impl_params const& impl_param) { + assert(static_cast(node.get_primitive()->output_data_type) == false && + "Output data type forcing is not supported for gather_nonzero_node!"); + if (impl_param.memory_deps.count(1)) { + auto out_size = read_vector(impl_param.memory_deps.at(1), impl_param.prog.get_stream()); + ov::Shape output_shape(out_size.begin(), out_size.end()); + ov::PartialShape output_pshape(output_shape); + return layout{output_pshape, cldnn::data_types::i32, cldnn::format::bfyx}; + } else { + return layout{ov::PartialShape({ov::Dimension::dynamic(), ov::Dimension::dynamic(), 1, 1}), cldnn::data_types::i32, cldnn::format::bfyx}; + } +} + +std::string gather_nonzero_inst::to_string(gather_nonzero_node const& node) { + auto desc = node.get_primitive(); + auto node_info = 
node.desc_to_json(); + auto& input = node.input(); + + std::stringstream primitive_description; + + json_composite gather_nonzero_info; + gather_nonzero_info.add("input id", input.id()); + gather_nonzero_info.add("output layout", node.get_output_layout().to_string()); + + node_info->add("gather_nonzero info", gather_nonzero_info); + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +gather_nonzero_inst::typed_primitive_inst(network& network, gather_nonzero_node const& node) : parent(network, node, false) {} + +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/kernel_selector/common/common_types.h b/src/plugins/intel_gpu/src/kernel_selector/common/common_types.h index 221933f67ee..e0322b5648c 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/common/common_types.h +++ b/src/plugins/intel_gpu/src/kernel_selector/common/common_types.h @@ -32,6 +32,8 @@ enum class KernelType { SCALE, REORDER, RESHAPE, + COUNT_NONZERO, + GATHER_NONZERO, PERMUTE, CONCATENATION, RESAMPLE, diff --git a/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/non_zero/count_nonzero_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/non_zero/count_nonzero_kernel_ref.cpp new file mode 100644 index 00000000000..3e299d5b300 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/non_zero/count_nonzero_kernel_ref.cpp @@ -0,0 +1,92 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "count_nonzero_kernel_ref.h" +#include "kernel_selector_utils.h" +#include + +namespace kernel_selector { +ParamsKey CountNonzeroKernelRef::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + k.EnableInputDataType(Datatype::INT8); + k.EnableInputDataType(Datatype::UINT8); + k.EnableInputDataType(Datatype::INT32); + k.EnableInputDataType(Datatype::UINT32); + 
k.EnableInputDataType(Datatype::INT64); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::UINT8); + k.EnableOutputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::INT32); + k.EnableOutputDataType(Datatype::UINT32); + k.EnableOutputDataType(Datatype::INT64); + k.EnableAllInputLayout(); + k.EnableAllOutputLayout(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableDifferentTypes(); + return k; +} + +KernelsData CountNonzeroKernelRef::GetKernelsData(const Params& params, const optional_params& options) const { + assert(params.GetType() == KernelType::COUNT_NONZERO); + + KernelData kd = KernelData::Default(params); + count_nonzero_params& newParams = *static_cast(kd.params.get()); + + auto entry_point = GetEntryPoint(kernelName, newParams.layerID, params, options); + auto cldnn_jit = MakeBaseParamsJitConstants(newParams); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); + + const auto& in = newParams.inputs[0]; + auto& kernel = kd.kernels[0]; + const auto& in_dims = in.GetDims(); + + std::vector> dims_by_gws; + + if (in_dims.size() == 4) { + kernel.params.workGroups.global = {in_dims[0].v, in_dims[1].v, in_dims[2].v * in_dims[3].v}; + dims_by_gws = {{Tensor::DataChannelName::X}, + {Tensor::DataChannelName::Y}, + {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}}; + } else if (in_dims.size() == 5) { + kernel.params.workGroups.global = {in_dims[0].v, in_dims[1].v * in_dims[2].v, in_dims[3].v * in_dims[4].v}; + dims_by_gws = {{Tensor::DataChannelName::X}, + {Tensor::DataChannelName::Y, Tensor::DataChannelName::Z}, + {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}}; + } else { + kernel.params.workGroups.global = {in_dims[0].v * in_dims[1].v, in_dims[2].v * in_dims[3].v, in_dims[4].v * in_dims[5].v}; + dims_by_gws = {{Tensor::DataChannelName::X, Tensor::DataChannelName::Y}, + {Tensor::DataChannelName::Z, 
Tensor::DataChannelName::W}, + {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}}; + } + + kernel.params.workGroups.local = GetOptimalLocalWorkGroupSizes(kernel.params.workGroups.global, + params.engineInfo, + newParams.inputs[0].GetLayout(), + newParams.outputs[0].GetLayout(), + dims_by_gws); + + kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT); + kernel.params.arguments = GetArgsDesc(1, false, false); + + return {kd}; +} + +KernelsPriority CountNonzeroKernelRef::GetKernelsPriority(const Params& /*params*/, const optional_params& /*options*/) const { + return DONT_USE_IF_HAVE_SOMETHING_ELSE; +} + +bool CountNonzeroKernelRef::Validate(const Params& p, const optional_params& op) const { + if (!KernelBaseOpenCL::Validate(p, op)) + return false; + + const auto& rp = static_cast(p); + + return Tensor::SimpleLayout(rp.inputs[0].GetLayout()); +} +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/non_zero/count_nonzero_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/non_zero/count_nonzero_kernel_ref.h new file mode 100644 index 00000000000..e6c937c41e7 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/non_zero/count_nonzero_kernel_ref.h @@ -0,0 +1,36 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_base_opencl.h" + +namespace kernel_selector { +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// count_nonzero_params +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +struct count_nonzero_params : public base_params { + count_nonzero_params() : base_params(KernelType::COUNT_NONZERO) {} +}; + 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// count_nonzero_optional_params +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +struct count_nonzero_optional_params : optional_params { + count_nonzero_optional_params() : optional_params(KernelType::COUNT_NONZERO) {} +}; + +class CountNonzeroKernelRef : public KernelBaseOpenCL { +public: + CountNonzeroKernelRef() : KernelBaseOpenCL("count_nonzero_ref") {} + virtual ~CountNonzeroKernelRef() {} + + KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override; + ParamsKey GetSupportedKey() const override; + +protected: + bool Validate(const Params& p, const optional_params& op) const override; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/non_zero/count_nonzero_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/non_zero/count_nonzero_kernel_selector.cpp new file mode 100644 index 00000000000..e2142299741 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/non_zero/count_nonzero_kernel_selector.cpp @@ -0,0 +1,15 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "count_nonzero_kernel_selector.h" +#include "count_nonzero_kernel_ref.h" + +namespace kernel_selector { + +count_nonzero_kernel_selector::count_nonzero_kernel_selector() { Attach(); } + +KernelsData count_nonzero_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const { + return GetNaiveBestKernel(params, options, KernelType::COUNT_NONZERO); +} +} // namespace kernel_selector diff --git 
a/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/non_zero/count_nonzero_kernel_selector.h b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/non_zero/count_nonzero_kernel_selector.h new file mode 100644 index 00000000000..83e4836b45a --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/non_zero/count_nonzero_kernel_selector.h @@ -0,0 +1,23 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_selector.h" + +namespace kernel_selector { +class count_nonzero_kernel_selector : public kernel_selector_base { +public: + static count_nonzero_kernel_selector& Instance() { + static count_nonzero_kernel_selector instance_; + return instance_; + } + + count_nonzero_kernel_selector(); + + virtual ~count_nonzero_kernel_selector() {} + + KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/non_zero/gather_nonzero_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/non_zero/gather_nonzero_kernel_ref.cpp new file mode 100644 index 00000000000..9bdfae9bc79 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/non_zero/gather_nonzero_kernel_ref.cpp @@ -0,0 +1,96 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "gather_nonzero_kernel_ref.h" +#include "kernel_selector_utils.h" +#include + +namespace kernel_selector { +ParamsKey GatherNonzeroKernelRef::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + k.EnableInputDataType(Datatype::INT8); + k.EnableInputDataType(Datatype::UINT8); + k.EnableInputDataType(Datatype::INT32); + k.EnableInputDataType(Datatype::UINT32); + k.EnableInputDataType(Datatype::INT64); + 
k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::UINT8); + k.EnableOutputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::INT32); + k.EnableOutputDataType(Datatype::UINT32); + k.EnableOutputDataType(Datatype::INT64); + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bfyx); + k.EnableInputLayout(DataLayout::bfzyx); + k.EnableOutputLayout(DataLayout::bfzyx); + k.EnableInputLayout(DataLayout::bfwzyx); + k.EnableOutputLayout(DataLayout::bfwzyx); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableDifferentTypes(); + return k; +} + +KernelsData GatherNonzeroKernelRef::GetKernelsData(const Params& params, const optional_params& options) const { + assert(params.GetType() == KernelType::GATHER_NONZERO); + + KernelData kd = KernelData::Default(params); + gather_nonzero_params& newParams = *static_cast(kd.params.get()); + + auto entry_point = GetEntryPoint(kernelName, newParams.layerID, params, options); + auto cldnn_jit = MakeBaseParamsJitConstants(newParams); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); + + const auto& in = newParams.inputs[0]; + auto& kernel = kd.kernels[0]; + const auto& in_dims = in.GetDims(); + + std::vector> dims_by_gws; + + if (in_dims.size() == 4) { + kernel.params.workGroups.global = {in_dims[0].v, in_dims[1].v, in_dims[2].v * in_dims[3].v}; + dims_by_gws = {{Tensor::DataChannelName::X}, + {Tensor::DataChannelName::Y}, + {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}}; + } else if (in_dims.size() == 5) { + kernel.params.workGroups.global = {in_dims[0].v, in_dims[1].v * in_dims[2].v, in_dims[3].v * in_dims[4].v}; + dims_by_gws = {{Tensor::DataChannelName::X}, + {Tensor::DataChannelName::Y, Tensor::DataChannelName::Z}, + {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}}; + } else { + kernel.params.workGroups.global = {in_dims[0].v * in_dims[1].v, in_dims[2].v * 
in_dims[3].v, in_dims[4].v * in_dims[5].v}; + dims_by_gws = {{Tensor::DataChannelName::X, Tensor::DataChannelName::Y}, + {Tensor::DataChannelName::Z, Tensor::DataChannelName::W}, + {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}}; + } + + kernel.params.workGroups.local = GetOptimalLocalWorkGroupSizes(kernel.params.workGroups.global, + params.engineInfo, + newParams.inputs[0].GetLayout(), + newParams.outputs[0].GetLayout(), + dims_by_gws); + + kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT); + kernel.params.arguments = GetArgsDesc(2, false, false); + + return {kd}; +} + +KernelsPriority GatherNonzeroKernelRef::GetKernelsPriority(const Params& /*params*/, const optional_params& /*options*/) const { + return DONT_USE_IF_HAVE_SOMETHING_ELSE; +} + +bool GatherNonzeroKernelRef::Validate(const Params& p, const optional_params& op) const { + if (!KernelBaseOpenCL::Validate(p, op)) + return false; + + const auto& rp = static_cast(p); + + return Tensor::SimpleLayout(rp.inputs[0].GetLayout()); +} +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/non_zero/gather_nonzero_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/non_zero/gather_nonzero_kernel_ref.h new file mode 100644 index 00000000000..c0a3cf0c355 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/non_zero/gather_nonzero_kernel_ref.h @@ -0,0 +1,36 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_base_opencl.h" + +namespace kernel_selector { +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// gather_nonzero_params +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +struct gather_nonzero_params : public base_params { + 
gather_nonzero_params() : base_params(KernelType::GATHER_NONZERO) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// gather_nonzero_optional_params +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +struct gather_nonzero_optional_params : optional_params { + gather_nonzero_optional_params() : optional_params(KernelType::GATHER_NONZERO) {} +}; + +class GatherNonzeroKernelRef : public KernelBaseOpenCL { +public: + GatherNonzeroKernelRef() : KernelBaseOpenCL("gather_nonzero_ref") {} + virtual ~GatherNonzeroKernelRef() {} + + KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override; + ParamsKey GetSupportedKey() const override; + +protected: + bool Validate(const Params& p, const optional_params& op) const override; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/non_zero/gather_nonzero_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/non_zero/gather_nonzero_kernel_selector.cpp new file mode 100644 index 00000000000..30168b303ed --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/non_zero/gather_nonzero_kernel_selector.cpp @@ -0,0 +1,15 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "gather_nonzero_kernel_selector.h" +#include "gather_nonzero_kernel_ref.h" + +namespace kernel_selector { + +gather_nonzero_kernel_selector::gather_nonzero_kernel_selector() { Attach(); } + +KernelsData gather_nonzero_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const { + return GetNaiveBestKernel(params, options, KernelType::GATHER_NONZERO); +} +} // namespace kernel_selector 
// ---------------------------------------------------------------------------
// src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/non_zero/gather_nonzero_kernel_selector.h
// ---------------------------------------------------------------------------
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "kernel_selector.h"

namespace kernel_selector {
// Selects the best available gather_nonzero kernel implementation for the
// given parameters. Implementations are registered in the constructor
// (defined in the corresponding .cpp, not visible here).
class gather_nonzero_kernel_selector : public kernel_selector_base {
public:
    // Meyers singleton: one shared selector instance for all callers.
    static gather_nonzero_kernel_selector& Instance() {
        static gather_nonzero_kernel_selector instance_;
        return instance_;
    }

    gather_nonzero_kernel_selector();

    virtual ~gather_nonzero_kernel_selector() {}

    KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
};
}  // namespace kernel_selector

// ---------------------------------------------------------------------------
// src/plugins/intel_gpu/src/kernel_selector/core/cl_kernels/count_nonzero_ref.cl
// ---------------------------------------------------------------------------
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"

// Indirection so INPUT_ORDER expands to its comma-separated coordinate list
// before INPUT0_GET_INDEX is applied.
#define INPUT0_GET_INDEX1(idx_order) INPUT0_GET_INDEX(idx_order)

// Counts the nonzero elements of `input` and writes the NonZero output shape
// {rank, #nonzero, 1, 1} into `output`.
// One work-item per input element; gdim0/gdim1 pack the spatial coordinates,
// gdim2 packs (feature, batch).
// NOTE(review): assumes output[1] is zero-initialized by the host before
// launch -- the kernel only atomically accumulates into it; confirm.
KERNEL (count_nonzero_ref)(const __global INPUT0_TYPE* input,
                           volatile __global OUTPUT_TYPE* output)
{
    const uint gdim0 = (uint)get_global_id(0);
    const uint gdim1 = (uint)get_global_id(1);
    const uint gdim2 = (uint)get_global_id(2);

    #if INPUT0_DIMS == 6
        #define INPUT_ORDER b,f,w,z,y,x
        // gdim0 packs (x, y); gdim1 packs (z, w).
        const uint x = gdim0 % INPUT0_SIZE_X;
        const uint y = gdim0 / INPUT0_SIZE_X;
        const uint z = gdim1 % INPUT0_SIZE_Z;
        const uint w = gdim1 / INPUT0_SIZE_Z;
    #elif INPUT0_DIMS == 5
        #define INPUT_ORDER b,f,z,y,x
        const uint x = gdim0;
        const uint y = gdim1 % INPUT0_SIZE_Y;
        const uint z = gdim1 / INPUT0_SIZE_Y;
    #elif INPUT0_DIMS == 4
        #define INPUT_ORDER b,f,y,x
        const uint x = gdim0;
        const uint y = gdim1;
    #endif

    const uint f = gdim2 % INPUT0_FEATURE_NUM;
    const uint b = gdim2 / INPUT0_FEATURE_NUM;

    // 1 if this element is nonzero, reduced across the sub-group so only one
    // atomic per sub-group hits global memory.
    uint count = (input[INPUT0_GET_INDEX1(INPUT_ORDER)] == INPUT0_VAL_ZERO) ? 0 : 1;
    count = sub_group_reduce_add(count);

    if (get_sub_group_local_id() == 0)
        atomic_add(&(output[1]), count);

    // A single work-item fills in the constant parts of the shape:
    // output = {rank, #nonzero (accumulated above), 1, 1}.
    if (gdim0 == 0 && gdim1 == 0 && gdim2 == 0) {
        output[0] = INPUT0_DIMS;
        output[2] = 1;
        output[3] = 1;
    }
}

#undef INPUT0_GET_INDEX1
#undef INPUT_ORDER

// ---------------------------------------------------------------------------
// src/plugins/intel_gpu/src/kernel_selector/core/cl_kernels/gather_nonzero_ref.cl
// ---------------------------------------------------------------------------
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"

#define INPUT0_GET_INDEX1(idx_order) INPUT0_GET_INDEX(idx_order)

// Gathers the coordinates of the nonzero elements of `input` into `output`,
// stored rank-major: all batch indices first, then all feature indices, etc.
// `output_shape` is the {rank, #nonzero, 1, 1} buffer produced by
// count_nonzero_ref; output_shape[2] doubles as a global write cursor
// (it starts at 1 -- count_nonzero_ref wrote 1 there -- hence the `- 1`
// adjustments below). The emission order of points is therefore
// non-deterministic across sub-groups.
KERNEL (gather_nonzero_ref)(const __global INPUT0_TYPE* input,
                            volatile __global INPUT1_TYPE* output_shape,
                            __global OUTPUT_TYPE* output)
{
    const uint gdim0 = (uint)get_global_id(0);
    const uint gdim1 = (uint)get_global_id(1);
    const uint gdim2 = (uint)get_global_id(2);

    #if INPUT0_DIMS == 6
        #define INPUT_ORDER b,f,w,z,y,x
        const uint x = gdim0 % INPUT0_SIZE_X;
        const uint y = gdim0 / INPUT0_SIZE_X;
        const uint z = gdim1 % INPUT0_SIZE_Z;
        const uint w = gdim1 / INPUT0_SIZE_Z;
    #elif INPUT0_DIMS == 5
        #define INPUT_ORDER b,f,z,y,x
        const uint x = gdim0;
        const uint y = gdim1 % INPUT0_SIZE_Y;
        const uint z = gdim1 / INPUT0_SIZE_Y;
    #elif INPUT0_DIMS == 4
        #define INPUT_ORDER b,f,y,x
        const uint x = gdim0;
        const uint y = gdim1;
    #endif

    const uint f = gdim2 % INPUT0_FEATURE_NUM;
    const uint b = gdim2 / INPUT0_FEATURE_NUM;

    // Inclusive prefix sum of the nonzero flags across the sub-group gives
    // each lane its 1-based slot within the sub-group's batch of nonzeros.
    int num_nonzero_acc = (input[INPUT0_GET_INDEX1(INPUT_ORDER)] == INPUT0_VAL_ZERO) ? 0 : 1;
    num_nonzero_acc = sub_group_scan_inclusive_add(num_nonzero_acc);

    int pos;

    // The last lane holds the sub-group's total; it reserves a contiguous
    // range in the output via one atomic and gets the old cursor value back.
    if (get_sub_group_local_id() == (get_sub_group_size() - 1)) {
        pos = atomic_add(&(output_shape[2]), num_nonzero_acc);
        pos = pos - 1;   // cursor started at 1, positions are 0-based
    }

    // Broadcast the reserved base position from the last lane to all lanes.
    // (pos is intentionally read uninitialized on the other lanes: only the
    // source lane's value is used by the broadcast.)
    pos = sub_group_broadcast(pos, (get_sub_group_size() - 1));

    // output_shape = {rank, # nonzero, 1, 1}
    if (input[INPUT0_GET_INDEX1(INPUT_ORDER)] != INPUT0_VAL_ZERO) {
        const int num_nonzero = output_shape[1];

        // This lane's slot: base + 1-based scan value - 1.
        pos = pos + num_nonzero_acc - 1;

        // Coordinate k of point `pos` lives at pos + num_nonzero * k.
        int output_b = pos;
        int output_f = pos + num_nonzero;

        output[output_b] = b;
        output[output_f] = f;

        #if INPUT0_DIMS == 6
        int output_w = pos + num_nonzero * 2;
        int output_z = pos + num_nonzero * 3;
        int output_y = pos + num_nonzero * 4;
        int output_x = pos + num_nonzero * 5;

        output[output_w] = w;
        output[output_z] = z;
        #elif INPUT0_DIMS == 5
        int output_z = pos + num_nonzero * 2;
        int output_y = pos + num_nonzero * 3;
        int output_x = pos + num_nonzero * 4;

        output[output_z] = z;
        #elif INPUT0_DIMS == 4
        int output_y = pos + num_nonzero * 2;
        int output_x = pos + num_nonzero * 3;
        #endif

        output[output_y] = y;
        output[output_x] = x;
    }
}

#undef INPUT0_GET_INDEX1
#undef INPUT_ORDER

// ---------------------------------------------------------------------------
// src/plugins/intel_gpu/src/plugin/ops/non_zero.cpp (beginning)
// ---------------------------------------------------------------------------
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "intel_gpu/plugin/program.hpp"
#include "intel_gpu/plugin/common_utils.hpp"
+#include "ngraph/op/non_zero.hpp" + +#include "intel_gpu/primitives/non_zero.hpp" + +namespace ov { +namespace intel_gpu { + +static void CreateNonZeroOp(Program& p, const std::shared_ptr& op) { + p.ValidateInputs(op, {1}); + auto input_primitives = p.GetInputPrimitiveIDs(op); + std::string layer_name = layer_type_name_ID(op); + + cldnn::primitive_id count_prim_id = layer_name + "_count"; + auto count_prim = cldnn::count_nonzero(count_prim_id, + input_primitives[0], + op->get_friendly_name()); + + auto gather_prim = cldnn::gather_nonzero(layer_name, + input_primitives[0], + count_prim_id, + op->get_friendly_name()); + + p.AddPrimitive(count_prim); + p.AddPrimitive(gather_prim); + p.AddPrimitiveToProfiler(op); +} + +REGISTER_FACTORY_IMPL(v3, NonZero); + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/tests/test_cases/non_zero_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/non_zero_gpu_test.cpp new file mode 100644 index 00000000000..9d30dae049a --- /dev/null +++ b/src/plugins/intel_gpu/tests/test_cases/non_zero_gpu_test.cpp @@ -0,0 +1,342 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" + +#include +#include +#include +#include +#include + +#include + +using namespace cldnn; +using namespace ::tests; + +inline void do_count_non_zero_test(engine& engine, + const cldnn::memory::ptr& input_data, + const std::vector& expected_results) +{ + topology topology; + topology.add(input_layout("InputData", input_data->get_layout())); + topology.add( + count_nonzero("count_nonzero", "InputData") + ); + + network network(engine, topology); + + network.set_input_data("InputData", input_data); + auto outputs = network.execute(); + auto output = outputs.at("count_nonzero").get_memory(); + cldnn::mem_lock output_ptr(output, get_test_stream()); + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], output_ptr[i]); + } +} + +inline void 
do_gather_non_zero_test(engine& engine, + const cldnn::memory::ptr& input_data, + const cldnn::memory::ptr& output_shape, + const std::vector& expected_results) +{ + topology topology; + topology.add(input_layout("InputData", input_data->get_layout())); + topology.add(input_layout("OutputShape", output_shape->get_layout())); + topology.add( + gather_nonzero("gather_nonzero", "InputData", "OutputShape") + ); + + network network(engine, topology); + + network.set_input_data("InputData", input_data); + network.set_input_data("OutputShape", output_shape); + auto outputs = network.execute(); + auto output = outputs.at("gather_nonzero").get_memory(); + cldnn::mem_lock output_ptr(output, get_test_stream()); + cldnn::mem_lock shape_ptr(output_shape, get_test_stream()); + + int num_ranks = shape_ptr[0]; + int num_nonzero = shape_ptr[1]; + + for (int i = 0; i < num_nonzero; i++) { + bool found = false; + for (int j = 0; j < num_nonzero; j++) { + for (int k = 0; k < num_ranks; k++) { + if (output_ptr[i+num_nonzero*k] != expected_results[j+num_nonzero*k]) + break; + + if (k == (num_ranks - 1)) { + found = true; + } + } + if (found) + break; + } + + EXPECT_TRUE(found); + } +} + +inline void do_non_zero_test(engine& engine, + const cldnn::memory::ptr& input_data, + const std::vector& expected_shape, + const std::vector& expected_results) +{ + topology topology; + topology.add(input_layout("InputData", input_data->get_layout())); + topology.add( + count_nonzero("count_nonzero", "InputData") + ); + topology.add( + gather_nonzero("gather_nonzero", "InputData", "count_nonzero") + ); + + network network(engine, topology); + + network.set_input_data("InputData", input_data); + auto outputs = network.execute(); + auto output = outputs.at("gather_nonzero").get_memory(); + cldnn::mem_lock output_ptr(output, get_test_stream()); + std::vector output_list = std::vector(output_ptr.begin(), output_ptr.end()); + + int num_ranks = expected_shape[0]; + int num_nonzero = expected_shape[1]; + + 
EXPECT_EQ(num_ranks*num_nonzero, output_list.size()); + + for (int i = 0; i < num_nonzero; i++) { + bool found = false; + for (int j = 0; j < num_nonzero; j++) { + for (int k = 0; k < num_ranks; k++) { + if (output_list[i+num_nonzero*k] != expected_results[j+num_nonzero*k]) + break; + + if (k == (num_ranks - 1)) { + found = true; + } + } + if (found) + break; + } + + EXPECT_TRUE(found); + } +} + +TEST(count_nonzero_gpu_fp16, test1) { + auto& engine = get_test_engine(); + + auto input = engine.allocate_memory({ data_types::f16, format::bfyx, tensor{ 1, 3, 3, 1 } }); + + set_values(input, { + FLOAT16(0), FLOAT16(1), FLOAT16(8), + FLOAT16(5), FLOAT16(5), FLOAT16(2), + FLOAT16(7), FLOAT16(10), FLOAT16(4), + }); + + std::vector expected_results = { + 4, 8, 1, 1, + }; + + do_count_non_zero_test(engine, input, expected_results); +} + +TEST(gather_nonzero_gpu_fp16, test1) { + auto& engine = get_test_engine(); + + auto input = engine.allocate_memory({ data_types::f16, format::bfyx, tensor{ 1, 3, 3, 1 } }); + auto output_shape = engine.allocate_memory({ data_types::i32, format::bfyx, tensor{ 1, 1, 4, 1 } }); + + set_values(input, { + FLOAT16(0), FLOAT16(1), FLOAT16(8), + FLOAT16(5), FLOAT16(5), FLOAT16(2), + FLOAT16(7), FLOAT16(10), FLOAT16(4), + }); + + set_values(output_shape, { + 4, 8, 1, 1, + }); + + std::vector expected_results = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 1, 1, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, + 1, 2, 0, 1, 2, 0, 1, 2, + }; + + do_gather_non_zero_test(engine, input, output_shape, expected_results); +} + +TEST(gather_nonzero_gpu_fp16, test2) { + auto& engine = get_test_engine(); + + auto input = engine.allocate_memory({ data_types::f16, format::bfyx, tensor{ 1, 3, 3, 1 } }); + auto output_shape = engine.allocate_memory({ data_types::i32, format::bfyx, tensor{ 1, 1, 4, 1 } }); + + set_values(input, { + FLOAT16(0), FLOAT16(1), FLOAT16(8), + FLOAT16(5), FLOAT16(5), FLOAT16(0), + FLOAT16(7), FLOAT16(0), FLOAT16(4), + }); + + set_values(output_shape, { + 4, 6, 
1, 1, + }); + + std::vector expected_results = { + 0, 0, 0, 0, 0, 0, + 0, 0, 1, 1, 2, 2, + 0, 0, 0, 0, 0, 0, + 1, 2, 0, 1, 0, 2, + }; + + do_gather_non_zero_test(engine, input, output_shape, expected_results); +} + +TEST(nonzero_gpu_fp16, test1) { + auto& engine = get_test_engine(); + + auto input = engine.allocate_memory({ data_types::f16, format::bfyx, tensor{ 1, 3, 3, 1 } }); + + set_values(input, { + FLOAT16(0), FLOAT16(1), FLOAT16(8), + FLOAT16(5), FLOAT16(5), FLOAT16(0), + FLOAT16(7), FLOAT16(0), FLOAT16(4), + }); + + std::vector expected_shape = { + 4, 6, 1, 1, + }; + + std::vector expected_results = { + 0, 0, 0, 0, 0, 0, + 0, 0, 1, 1, 2, 2, + 0, 0, 0, 0, 0, 0, + 1, 2, 0, 1, 0, 2, + }; + + do_non_zero_test(engine, input, expected_shape, expected_results); +} + +TEST(nonzero_gpu_fp16, test2) { + auto& engine = get_test_engine(); + + auto input = engine.allocate_memory({ data_types::f16, format::bfzyx, tensor{ 1, 3, 3, 3, 1 } }); + + set_values(input, { + FLOAT16(0), FLOAT16(1), FLOAT16(8), + FLOAT16(7), FLOAT16(10), FLOAT16(4), + FLOAT16(7), FLOAT16(0), FLOAT16(4), + FLOAT16(9), FLOAT16(5), FLOAT16(1), + FLOAT16(2), FLOAT16(0), FLOAT16(8), + FLOAT16(2), FLOAT16(10), FLOAT16(7), + FLOAT16(2), FLOAT16(4), FLOAT16(8), + FLOAT16(5), FLOAT16(9), FLOAT16(10), + FLOAT16(10), FLOAT16(5), FLOAT16(2), + }); + + std::vector expected_shape = { + 5, 24, 1, 1, + }; + + std::vector expected_results = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 1, 1, 2, 2, 0, 0, 0, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, + 1, 2, 0, 1, 2, 0, 2, 0, 1, 2, 0, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, + }; + + do_non_zero_test(engine, input, expected_shape, expected_results); +} + +TEST(nonzero_gpu_fp16, test3) { + auto& engine = get_test_engine(); + + auto input = engine.allocate_memory({ data_types::f16, 
format::bfwzyx, tensor{ 1, 3, 3, 3, 3, 1 } }); + + set_values(input, { + FLOAT16(0), FLOAT16(1), FLOAT16(8), FLOAT16(7), FLOAT16(10), FLOAT16(4), FLOAT16(6), FLOAT16(5), FLOAT16(4), + FLOAT16(7), FLOAT16(0), FLOAT16(4), FLOAT16(9), FLOAT16(5), FLOAT16(1), FLOAT16(2), FLOAT16(2), FLOAT16(0), + FLOAT16(2), FLOAT16(0), FLOAT16(8), FLOAT16(2), FLOAT16(10), FLOAT16(7), FLOAT16(7), FLOAT16(0), FLOAT16(6), + FLOAT16(2), FLOAT16(4), FLOAT16(8), FLOAT16(5), FLOAT16(9), FLOAT16(10), FLOAT16(10), FLOAT16(5), FLOAT16(2), + FLOAT16(4), FLOAT16(8), FLOAT16(2), FLOAT16(1), FLOAT16(4), FLOAT16(10), FLOAT16(10), FLOAT16(2), FLOAT16(21), + FLOAT16(0), FLOAT16(1), FLOAT16(5), FLOAT16(1), FLOAT16(5), FLOAT16(1), FLOAT16(9), FLOAT16(4), FLOAT16(22), + FLOAT16(4), FLOAT16(3), FLOAT16(7), FLOAT16(6), FLOAT16(9), FLOAT16(8), FLOAT16(9), FLOAT16(7), FLOAT16(23), + FLOAT16(4), FLOAT16(10), FLOAT16(6), FLOAT16(3), FLOAT16(5), FLOAT16(5), FLOAT16(4), FLOAT16(2), FLOAT16(23), + FLOAT16(0), FLOAT16(4), FLOAT16(5), FLOAT16(3), FLOAT16(1), FLOAT16(2), FLOAT16(8), FLOAT16(5), FLOAT16(0), + }); + + std::vector expected_shape = { + 6, 73, 1, 1, + }; + + std::vector expected_results = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 1, 1, 1, 2, 2, 0, 0, 1, 1, 1, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 1, 1, 1, 2, 2, + 1, 2, 0, 1, 2, 0, 1, 2, 0, 2, 0, 1, 2, 0, 1, 0, 2, 0, 1, 2, 0, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 1, 2, 0, 1, 2, 0, 1, + }; + + do_non_zero_test(engine, input, expected_shape, expected_results); +} + +TEST(nonzero_gpu_fp32, test1) { + auto& engine = get_test_engine(); + + auto input = engine.allocate_memory({ data_types::f32, format::bfyx, tensor{ 2, 3, 5, 4 } }); + + set_values(input, { + 6, 6, 0, 3, 0, 4, 1, 0, 8, 4, 8, 5, 8, 6, 0, 2, 0, 9, 6, 9, + 1, 2, 4, 9, 0, 8, 5, 7, 4, 6, 8, 0, 6, 2, 3, 5, 0, 9, 8, 7, + 3, 6, 5, 3, 8, 4, 7, 5, 7, 8, 5, 2, 1, 8, 9, 2, 1, 4, 3, 3, + + 7, 3, 9, 9, 0, 2, 4, 0, 4, 9, 5, 9, 4, 5, 8, 1, 2, 9, 7, 6, + 7, 9, 6, 7, 2, 9, 2, 7, 8, 3, 1, 2, 7, 4, 6, 2, 3, 7, 0, 5, + 2, 3, 7, 7, 0, 3, 4, 0, 9, 0, 9, 0, 2, 7, 7, 8, 6, 6, 0, 8, + }); + + std::vector expected_shape = { + 4, 104, 1, 1, + }; + + std::vector expected_results = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + + + 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, + 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, + 
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, + 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, + 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, + 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, + + + 0, 1, 3, 0, 1, 3, 4, 0, 1, 2, 3, 0, 2, 3, 4, + 0, 1, 2, 3, 0, 1, 2, 3, 4, 0, 2, 3, 4, 0, 2, 3, 4, + 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, + 0, 1, 2, 3, 0, 1, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, + 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 4, + 0, 1, 2, 3, 0, 1, 3, 0, 2, 3, 4, 0, 1, 2, 4, + }; + + do_non_zero_test(engine, input, expected_shape, expected_results); +}