[GPU] NonZero primitive (#12519)

This commit is contained in:
Vladimir Paramuzov 2022-08-15 17:32:37 +04:00 committed by GitHub
parent 03b7c1e69e
commit dd55f434c3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 1194 additions and 0 deletions

View File

@ -155,6 +155,7 @@ REGISTER_FACTORY(v3, EmbeddingBagOffsetsSum);
REGISTER_FACTORY(v3, EmbeddingBagPackedSum);
REGISTER_FACTORY(v3, EmbeddingSegmentsSum);
REGISTER_FACTORY(v3, ExtractImagePatches);
REGISTER_FACTORY(v3, NonZero);
REGISTER_FACTORY(v3, ROIAlign);
REGISTER_FACTORY(v3, ScatterUpdate);
REGISTER_FACTORY(v3, ScatterElementsUpdate);

View File

@ -0,0 +1,39 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "primitive.hpp"
namespace cldnn {
/// @brief Counts the non-zero elements of the input and produces a 4-element i32
/// shape descriptor {rank, number of non-zeros, 1, 1} that gather_nonzero consumes.
struct count_nonzero : public primitive_base<count_nonzero> {
CLDNN_DECLARE_PRIMITIVE(count_nonzero)
/// @brief Constructs count_nonzero primitive.
/// @param id This primitive id.
/// @param data Input data primitive id.
/// @param ext_prim_id External primitive id (optional).
/// @param output_padding Output padding (optional).
count_nonzero(const primitive_id& id,
const primitive_id& data,
const primitive_id& ext_prim_id = "",
const padding& output_padding = padding())
: primitive_base(id, {data}, ext_prim_id, output_padding) {}
};
/// @brief Gathers the coordinates of the non-zero elements of the input data;
/// the second input is the shape descriptor produced by count_nonzero.
struct gather_nonzero : public primitive_base<gather_nonzero> {
CLDNN_DECLARE_PRIMITIVE(gather_nonzero)
/// @brief Constructs gather_nonzero primitive.
/// @param id This primitive id.
/// @param data Input data primitive id.
/// @param output_shape Output shape [rank of data, number of nonzero elements]
/// @param ext_prim_id External primitive id (optional).
/// @param output_padding Output padding (optional).
gather_nonzero(const primitive_id& id,
const primitive_id& data,
const primitive_id& output_shape,
const primitive_id& ext_prim_id = "",
const padding& output_padding = padding())
: primitive_base(id, {data, output_shape}, ext_prim_id, output_padding) {}
};
} // namespace cldnn

View File

@ -0,0 +1,119 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "non_zero_inst.h"
#include "primitive_base.hpp"
#include "impls/implementation_map.hpp"
#include "kernel_selector_helper.h"
#include "non_zero/count_nonzero_kernel_ref.h"
#include "non_zero/count_nonzero_kernel_selector.h"
#include "non_zero/gather_nonzero_kernel_ref.h"
#include "non_zero/gather_nonzero_kernel_selector.h"
#include "intel_gpu/runtime/error_handler.hpp"
using namespace cldnn;
namespace cldnn {
namespace ocl {
// OCL implementation wrapper for count_nonzero: queries the count_nonzero
// kernel selector and wraps the best matching kernel.
struct count_nonzero_impl : typed_primitive_impl_ocl<count_nonzero> {
using parent = typed_primitive_impl_ocl<count_nonzero>;
using parent::parent;
std::unique_ptr<primitive_impl> clone() const override {
return make_unique<count_nonzero_impl>(*this);
}
// Factory registered in the implementation map; throws via OPENVINO_ASSERT
// when no kernel supports the given parameters.
static primitive_impl* create(const count_nonzero_node& arg, const kernel_impl_params& impl_param) {
auto nonzero_params = get_default_params<kernel_selector::count_nonzero_params>(impl_param);
auto nonzero_optional_params =
get_default_optional_params<kernel_selector::count_nonzero_optional_params>(arg.get_program());
auto& kernel_selector = kernel_selector::count_nonzero_kernel_selector::Instance();
auto best_kernels = kernel_selector.GetBestKernels(nonzero_params, nonzero_optional_params);
OPENVINO_ASSERT(!best_kernels.empty(), "Cannot find a proper kernel for ", arg.id());
auto count_nonzero = new count_nonzero_impl(arg, best_kernels[0]);
return count_nonzero;
}
};
// OCL implementation wrapper for gather_nonzero. Mirrors count_nonzero_impl,
// with one addition: the second dependency (the shape buffer produced by
// count_nonzero) is appended to the kernel input tensors.
struct gather_nonzero_impl : typed_primitive_impl_ocl<gather_nonzero> {
    using parent = typed_primitive_impl_ocl<gather_nonzero>;
    using parent::parent;

    std::unique_ptr<primitive_impl> clone() const override {
        return make_unique<gather_nonzero_impl>(*this);
    }

    // Factory registered in the implementation map; throws via OPENVINO_ASSERT
    // when no kernel supports the given parameters.
    static primitive_impl* create(const gather_nonzero_node& arg, const kernel_impl_params& impl_param) {
        auto nonzero_params = get_default_params<kernel_selector::gather_nonzero_params>(impl_param);
        auto nonzero_optional_params =
            get_default_optional_params<kernel_selector::gather_nonzero_optional_params>(arg.get_program());
        // Input 1 is the {rank, #nonzero, 1, 1} descriptor from count_nonzero.
        nonzero_params.inputs.push_back(convert_data_tensor(arg.input(1).get_output_layout()));
        auto& kernel_selector = kernel_selector::gather_nonzero_kernel_selector::Instance();
        auto best_kernels = kernel_selector.GetBestKernels(nonzero_params, nonzero_optional_params);
        OPENVINO_ASSERT(!best_kernels.empty(), "Cannot find a proper kernel for ", arg.id());
        return new gather_nonzero_impl(arg, best_kernels[0]);
    }
};
namespace detail {
// Registers the OCL count_nonzero implementation for all supported
// data-type / plain-layout (4/5/6-D) combinations.
attach_count_nonzero_impl::attach_count_nonzero_impl() {
implementation_map<count_nonzero>::add(impl_types::ocl, count_nonzero_impl::create, {
std::make_tuple(data_types::f32, format::bfyx),
std::make_tuple(data_types::f16, format::bfyx),
std::make_tuple(data_types::i32, format::bfyx),
std::make_tuple(data_types::i8, format::bfyx),
std::make_tuple(data_types::u8, format::bfyx),
std::make_tuple(data_types::f32, format::bfzyx),
std::make_tuple(data_types::f16, format::bfzyx),
std::make_tuple(data_types::i32, format::bfzyx),
std::make_tuple(data_types::i8, format::bfzyx),
std::make_tuple(data_types::u8, format::bfzyx),
std::make_tuple(data_types::f32, format::bfwzyx),
std::make_tuple(data_types::f16, format::bfwzyx),
std::make_tuple(data_types::i32, format::bfwzyx),
std::make_tuple(data_types::i8, format::bfwzyx),
std::make_tuple(data_types::u8, format::bfwzyx),
});
}
// Registers the OCL gather_nonzero implementation; the supported set matches
// attach_count_nonzero_impl so the two primitives can always be paired.
attach_gather_nonzero_impl::attach_gather_nonzero_impl() {
implementation_map<gather_nonzero>::add(impl_types::ocl, gather_nonzero_impl::create, {
std::make_tuple(data_types::f32, format::bfyx),
std::make_tuple(data_types::f16, format::bfyx),
std::make_tuple(data_types::i32, format::bfyx),
std::make_tuple(data_types::i8, format::bfyx),
std::make_tuple(data_types::u8, format::bfyx),
std::make_tuple(data_types::f32, format::bfzyx),
std::make_tuple(data_types::f16, format::bfzyx),
std::make_tuple(data_types::i32, format::bfzyx),
std::make_tuple(data_types::i8, format::bfzyx),
std::make_tuple(data_types::u8, format::bfzyx),
std::make_tuple(data_types::f32, format::bfwzyx),
std::make_tuple(data_types::f16, format::bfwzyx),
std::make_tuple(data_types::i32, format::bfwzyx),
std::make_tuple(data_types::i8, format::bfwzyx),
std::make_tuple(data_types::u8, format::bfwzyx),
});
}
} // namespace detail
} // namespace ocl
} // namespace cldnn

View File

@ -90,6 +90,8 @@ void register_implementations() {
REGISTER_OCL(embedding_bag);
REGISTER_OCL(extract_image_patches);
REGISTER_OCL(convert_color);
REGISTER_OCL(count_nonzero);
REGISTER_OCL(gather_nonzero);
}
} // namespace ocl

View File

@ -73,6 +73,7 @@
#include "intel_gpu/primitives/space_to_batch.hpp"
#include "intel_gpu/primitives/strided_slice.hpp"
#include "intel_gpu/primitives/tile.hpp"
#include "intel_gpu/primitives/non_zero.hpp"
namespace cldnn {
namespace ocl {
@ -166,6 +167,8 @@ REGISTER_OCL(cum_sum);
REGISTER_OCL(embedding_bag);
REGISTER_OCL(extract_image_patches);
REGISTER_OCL(convert_color);
REGISTER_OCL(count_nonzero);
REGISTER_OCL(gather_nonzero);
#undef REGISTER_OCL

View File

@ -0,0 +1,81 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
///////////////////////////////////////////////////////////////////////////////////////////////////
#pragma once
#include "intel_gpu/primitives/non_zero.hpp"
#include "primitive_inst.h"
#include "intel_gpu/runtime/error_handler.hpp"
#include <string>
#include <memory>
namespace cldnn {
// Program node for count_nonzero.
template <>
struct typed_program_node<count_nonzero> : public typed_program_node_base<count_nonzero> {
    using parent = typed_program_node_base<count_nonzero>;

    typed_program_node(const std::shared_ptr<count_nonzero> prim, program& prog) : parent(prim, prog) {
        // Padding handling is enabled on all axes for this node.
        support_padding_all(true);
    }

public:
    using parent::parent;

    // Single data input; the node must have exactly one dependency.
    program_node& input() const {
        OPENVINO_ASSERT(dependencies.size() == 1, "[GPU] Primitive ", id(), " has invalid number of dependencies");
        return get_dependency(0);
    }
};

using count_nonzero_node = typed_program_node<count_nonzero>;
// Runtime instance of count_nonzero; owns the fixed 4-int output buffer.
template <>
class typed_primitive_inst<count_nonzero> : public typed_primitive_inst_base<count_nonzero> {
using parent = typed_primitive_inst_base<count_nonzero>;
public:
static layout calc_output_layout(count_nonzero_node const& node, kernel_impl_params const& impl_param);
static std::string to_string(count_nonzero_node const& node);
typed_primitive_inst(network& network, count_nonzero_node const& node);
private:
// Zeroes the output before each run (the kernel accumulates atomically).
void on_execute() override;
};
using count_nonzero_inst = typed_primitive_inst<count_nonzero>;
// Program node for gather_nonzero; its output shape depends on runtime data
// (the count computed by count_nonzero), hence generates_dynamic_output().
template <>
struct typed_program_node<gather_nonzero> : public typed_program_node_base<gather_nonzero> {
    using parent = typed_program_node_base<gather_nonzero>;

    typed_program_node(const std::shared_ptr<gather_nonzero> prim, program& prog) : parent(prim, prog) {
        // Padding handling is enabled on all axes for this node.
        support_padding_all(true);
    }

public:
    using parent::parent;

    bool generates_dynamic_output() const override { return true; }

    // Input 0 is the data tensor, input 1 the shape buffer from count_nonzero.
    program_node& input(size_t index = 0) const {
        OPENVINO_ASSERT(dependencies.size() == 2, "[GPU] Primitive ", id(), " has invalid number of dependencies");
        return get_dependency(index);
    }
};

using gather_nonzero_node = typed_program_node<gather_nonzero>;
// Runtime instance of gather_nonzero; its output layout is resolved from the
// count_nonzero result at execution time (see calc_output_layout).
template <>
class typed_primitive_inst<gather_nonzero> : public typed_primitive_inst_base<gather_nonzero> {
using parent = typed_primitive_inst_base<gather_nonzero>;
public:
static layout calc_output_layout(gather_nonzero_node const& node, kernel_impl_params const& impl_param);
static std::string to_string(gather_nonzero_node const& node);
typed_primitive_inst(network& network, gather_nonzero_node const& node);
};
using gather_nonzero_inst = typed_primitive_inst<gather_nonzero>;
} // namespace cldnn

View File

@ -0,0 +1,93 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
///////////////////////////////////////////////////////////////////////////////////////////////////
#include "non_zero_inst.h"
#include "primitive_type_base.h"
#include "intel_gpu/runtime/memory.hpp"
#include "intel_gpu/runtime/error_handler.hpp"
#include "intel_gpu/runtime/debug_configuration.hpp"
#include "json_object.h"
#include <string>
namespace cldnn {
// -----------------------------------------------
// count_nonzero
// -----------------------------------------------
// Singleton type descriptor used to register and identify the primitive.
primitive_type_id count_nonzero::type_id() {
static primitive_type_base<count_nonzero> instance;
return &instance;
}
// count_nonzero always produces a fixed 4-element i32 buffer; the kernel fills
// it with {input rank, number of non-zeros, 1, 1}, which gather_nonzero then
// reads as its output shape. The layout is therefore static.
layout count_nonzero_inst::calc_output_layout(count_nonzero_node const& node, kernel_impl_params const& impl_param) {
assert(static_cast<bool>(node.get_primitive()->output_data_type) == false &&
"Output data type forcing is not supported for count_nonzero_node!");
return layout{cldnn::data_types::i32, cldnn::format::bfyx, tensor{1, 1, 1, 4}};
}
// Human-readable dump of a count_nonzero node for graph logging.
std::string count_nonzero_inst::to_string(count_nonzero_node const& node) {
    auto node_info = node.desc_to_json();
    auto& input = node.input();
    std::stringstream primitive_description;
    json_composite count_nonzero_info;
    count_nonzero_info.add("input id", input.id());
    // The output is always the fixed {rank, count, 1, 1} i32 descriptor.
    count_nonzero_info.add("output shape", tensor{1, 1, 1, 4});
    node_info->add("count_nonzero info", count_nonzero_info);
    node_info->dump(primitive_description);
    return primitive_description.str();
}
count_nonzero_inst::typed_primitive_inst(network& network, count_nonzero_node const& node) : parent(network, node) {}
// The kernel accumulates the non-zero count into the output with atomic adds,
// so the buffer must be zero-filled before every execution.
void count_nonzero_inst::on_execute() {
output_memory().fill(_network.get_stream(), 0);
}
// -----------------------------------------------
// gather_nonzero
// -----------------------------------------------
// Singleton type descriptor used to register and identify the primitive.
primitive_type_id gather_nonzero::type_id() {
static primitive_type_base<gather_nonzero> instance;
return &instance;
}
// Resolves the gather_nonzero output layout. When the count_nonzero result is
// already available as memory dependency #1, its {rank, #nonzero, 1, 1} values
// are used directly as the output dims; otherwise a dynamic {?, ?, 1, 1}
// layout is returned and resolved later at runtime.
layout gather_nonzero_inst::calc_output_layout(gather_nonzero_node const& node, kernel_impl_params const& impl_param) {
assert(static_cast<bool>(node.get_primitive()->output_data_type) == false &&
"Output data type forcing is not supported for gather_nonzero_node!");
if (impl_param.memory_deps.count(1)) {
// Read the 4-element descriptor written by count_nonzero from device memory.
auto out_size = read_vector<int64_t>(impl_param.memory_deps.at(1), impl_param.prog.get_stream());
ov::Shape output_shape(out_size.begin(), out_size.end());
ov::PartialShape output_pshape(output_shape);
return layout{output_pshape, cldnn::data_types::i32, cldnn::format::bfyx};
} else {
return layout{ov::PartialShape({ov::Dimension::dynamic(), ov::Dimension::dynamic(), 1, 1}), cldnn::data_types::i32, cldnn::format::bfyx};
}
}
// Human-readable dump of a gather_nonzero node for graph logging.
std::string gather_nonzero_inst::to_string(gather_nonzero_node const& node) {
    auto node_info = node.desc_to_json();
    auto& input = node.input();
    std::stringstream primitive_description;
    json_composite gather_nonzero_info;
    gather_nonzero_info.add("input id", input.id());
    gather_nonzero_info.add("output layout", node.get_output_layout().to_string());
    node_info->add("gather_nonzero info", gather_nonzero_info);
    node_info->dump(primitive_description);
    return primitive_description.str();
}
// NOTE(review): the extra 'false' argument presumably defers output memory
// allocation until the dynamic shape is known — confirm against
// typed_primitive_inst_base's constructor.
gather_nonzero_inst::typed_primitive_inst(network& network, gather_nonzero_node const& node) : parent(network, node, false) {}
} // namespace cldnn

View File

@ -32,6 +32,8 @@ enum class KernelType {
SCALE,
REORDER,
RESHAPE,
COUNT_NONZERO,
GATHER_NONZERO,
PERMUTE,
CONCATENATION,
RESAMPLE,

View File

@ -0,0 +1,92 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "count_nonzero_kernel_ref.h"
#include "kernel_selector_utils.h"
#include <string>
namespace kernel_selector {
// Supported data types and layouts for the reference count_nonzero kernel.
// All layouts are declared here; Validate() additionally restricts inputs
// to simple (plain) layouts.
ParamsKey CountNonzeroKernelRef::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::F16);
k.EnableInputDataType(Datatype::F32);
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableInputDataType(Datatype::INT32);
k.EnableInputDataType(Datatype::UINT32);
k.EnableInputDataType(Datatype::INT64);
k.EnableOutputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::INT32);
k.EnableOutputDataType(Datatype::UINT32);
k.EnableOutputDataType(Datatype::INT64);
k.EnableAllInputLayout();
k.EnableAllOutputLayout();
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableBatching();
k.EnableDifferentTypes();
return k;
}
// Builds the kernel data for count_nonzero: JIT constants, global/local work
// sizes, and the argument descriptor (a single input buffer).
KernelsData CountNonzeroKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
assert(params.GetType() == KernelType::COUNT_NONZERO);
KernelData kd = KernelData::Default<count_nonzero_params>(params);
count_nonzero_params& newParams = *static_cast<count_nonzero_params*>(kd.params.get());
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, params, options);
auto cldnn_jit = MakeBaseParamsJitConstants(newParams);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
const auto& in = newParams.inputs[0];
auto& kernel = kd.kernels[0];
const auto& in_dims = in.GetDims();
// Map tensor dims onto a 3D global work size, one work-item per element:
// 4D: {x, y, f*b}; 5D: {x, y*z, f*b}; otherwise (6D): {x*y, z*w, f*b}.
// This mapping must match the index decomposition in count_nonzero_ref.cl.
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws;
if (in_dims.size() == 4) {
kernel.params.workGroups.global = {in_dims[0].v, in_dims[1].v, in_dims[2].v * in_dims[3].v};
dims_by_gws = {{Tensor::DataChannelName::X},
{Tensor::DataChannelName::Y},
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
} else if (in_dims.size() == 5) {
kernel.params.workGroups.global = {in_dims[0].v, in_dims[1].v * in_dims[2].v, in_dims[3].v * in_dims[4].v};
dims_by_gws = {{Tensor::DataChannelName::X},
{Tensor::DataChannelName::Y, Tensor::DataChannelName::Z},
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
} else {
kernel.params.workGroups.global = {in_dims[0].v * in_dims[1].v, in_dims[2].v * in_dims[3].v, in_dims[4].v * in_dims[5].v};
dims_by_gws = {{Tensor::DataChannelName::X, Tensor::DataChannelName::Y},
{Tensor::DataChannelName::Z, Tensor::DataChannelName::W},
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
}
kernel.params.workGroups.local = GetOptimalLocalWorkGroupSizes(kernel.params.workGroups.global,
params.engineInfo,
newParams.inputs[0].GetLayout(),
newParams.outputs[0].GetLayout(),
dims_by_gws);
kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT);
// One input tensor argument, no weights/bias.
kernel.params.arguments = GetArgsDesc(1, false, false);
return {kd};
}
// Reference kernel: pick only if no optimized implementation matches.
KernelsPriority CountNonzeroKernelRef::GetKernelsPriority(const Params& /*params*/, const optional_params& /*options*/) const {
return DONT_USE_IF_HAVE_SOMETHING_ELSE;
}
bool CountNonzeroKernelRef::Validate(const Params& p, const optional_params& op) const {
if (!KernelBaseOpenCL::Validate(p, op))
return false;
const auto& rp = static_cast<const count_nonzero_params&>(p);
return Tensor::SimpleLayout(rp.inputs[0].GetLayout());
}
} // namespace kernel_selector

View File

@ -0,0 +1,36 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "kernel_base_opencl.h"
namespace kernel_selector {
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// count_nonzero_params
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Kernel parameters for count_nonzero; nothing beyond base_params is needed.
struct count_nonzero_params : public base_params {
count_nonzero_params() : base_params(KernelType::COUNT_NONZERO) {}
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// count_nonzero_optional_params
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Optional kernel-selection parameters for count_nonzero.
struct count_nonzero_optional_params : optional_params {
count_nonzero_optional_params() : optional_params(KernelType::COUNT_NONZERO) {}
};
// Reference (naive) OpenCL implementation of the count_nonzero kernel.
class CountNonzeroKernelRef : public KernelBaseOpenCL {
public:
CountNonzeroKernelRef() : KernelBaseOpenCL("count_nonzero_ref") {}
virtual ~CountNonzeroKernelRef() {}
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
protected:
bool Validate(const Params& p, const optional_params& op) const override;
};
} // namespace kernel_selector

View File

@ -0,0 +1,15 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "count_nonzero_kernel_selector.h"
#include "count_nonzero_kernel_ref.h"
namespace kernel_selector {
// Registers all available count_nonzero kernel implementations.
count_nonzero_kernel_selector::count_nonzero_kernel_selector() { Attach<CountNonzeroKernelRef>(); }
// Selects a kernel for the given parameters using the naive strategy.
KernelsData count_nonzero_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
return GetNaiveBestKernel(params, options, KernelType::COUNT_NONZERO);
}
} // namespace kernel_selector

View File

@ -0,0 +1,23 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "kernel_selector.h"
namespace kernel_selector {
// Singleton selector choosing among count_nonzero kernel implementations.
class count_nonzero_kernel_selector : public kernel_selector_base {
public:
static count_nonzero_kernel_selector& Instance() {
static count_nonzero_kernel_selector instance_;
return instance_;
}
count_nonzero_kernel_selector();
virtual ~count_nonzero_kernel_selector() {}
KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
};
} // namespace kernel_selector

View File

@ -0,0 +1,96 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "gather_nonzero_kernel_ref.h"
#include "kernel_selector_utils.h"
#include <string>
namespace kernel_selector {
// Supported data types and layouts for the reference gather_nonzero kernel.
// Unlike count_nonzero, only the plain bfyx/bfzyx/bfwzyx layouts are declared.
ParamsKey GatherNonzeroKernelRef::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::F16);
k.EnableInputDataType(Datatype::F32);
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableInputDataType(Datatype::INT32);
k.EnableInputDataType(Datatype::UINT32);
k.EnableInputDataType(Datatype::INT64);
k.EnableOutputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::INT32);
k.EnableOutputDataType(Datatype::UINT32);
k.EnableOutputDataType(Datatype::INT64);
k.EnableInputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::bfyx);
k.EnableInputLayout(DataLayout::bfzyx);
k.EnableOutputLayout(DataLayout::bfzyx);
k.EnableInputLayout(DataLayout::bfwzyx);
k.EnableOutputLayout(DataLayout::bfwzyx);
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableBatching();
k.EnableDifferentTypes();
return k;
}
// Builds the kernel data for gather_nonzero: JIT constants, global/local work
// sizes, and the argument descriptor (two input buffers: data + shape).
KernelsData GatherNonzeroKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
assert(params.GetType() == KernelType::GATHER_NONZERO);
KernelData kd = KernelData::Default<gather_nonzero_params>(params);
gather_nonzero_params& newParams = *static_cast<gather_nonzero_params*>(kd.params.get());
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, params, options);
auto cldnn_jit = MakeBaseParamsJitConstants(newParams);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
const auto& in = newParams.inputs[0];
auto& kernel = kd.kernels[0];
const auto& in_dims = in.GetDims();
// Map tensor dims onto a 3D global work size, one work-item per element:
// 4D: {x, y, f*b}; 5D: {x, y*z, f*b}; otherwise (6D): {x*y, z*w, f*b}.
// This mapping must match the index decomposition in gather_nonzero_ref.cl.
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws;
if (in_dims.size() == 4) {
kernel.params.workGroups.global = {in_dims[0].v, in_dims[1].v, in_dims[2].v * in_dims[3].v};
dims_by_gws = {{Tensor::DataChannelName::X},
{Tensor::DataChannelName::Y},
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
} else if (in_dims.size() == 5) {
kernel.params.workGroups.global = {in_dims[0].v, in_dims[1].v * in_dims[2].v, in_dims[3].v * in_dims[4].v};
dims_by_gws = {{Tensor::DataChannelName::X},
{Tensor::DataChannelName::Y, Tensor::DataChannelName::Z},
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
} else {
kernel.params.workGroups.global = {in_dims[0].v * in_dims[1].v, in_dims[2].v * in_dims[3].v, in_dims[4].v * in_dims[5].v};
dims_by_gws = {{Tensor::DataChannelName::X, Tensor::DataChannelName::Y},
{Tensor::DataChannelName::Z, Tensor::DataChannelName::W},
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
}
kernel.params.workGroups.local = GetOptimalLocalWorkGroupSizes(kernel.params.workGroups.global,
params.engineInfo,
newParams.inputs[0].GetLayout(),
newParams.outputs[0].GetLayout(),
dims_by_gws);
kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT);
// Two input tensor arguments (data + shape descriptor), no weights/bias.
kernel.params.arguments = GetArgsDesc(2, false, false);
return {kd};
}
// Reference kernel: pick only if no optimized implementation matches.
KernelsPriority GatherNonzeroKernelRef::GetKernelsPriority(const Params& /*params*/, const optional_params& /*options*/) const {
return DONT_USE_IF_HAVE_SOMETHING_ELSE;
}
bool GatherNonzeroKernelRef::Validate(const Params& p, const optional_params& op) const {
if (!KernelBaseOpenCL::Validate(p, op))
return false;
const auto& rp = static_cast<const gather_nonzero_params&>(p);
return Tensor::SimpleLayout(rp.inputs[0].GetLayout());
}
} // namespace kernel_selector

View File

@ -0,0 +1,36 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "kernel_base_opencl.h"
namespace kernel_selector {
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// gather_nonzero_params
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Kernel parameters for gather_nonzero; nothing beyond base_params is needed.
struct gather_nonzero_params : public base_params {
gather_nonzero_params() : base_params(KernelType::GATHER_NONZERO) {}
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// gather_nonzero_optional_params
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Optional kernel-selection parameters for gather_nonzero.
struct gather_nonzero_optional_params : optional_params {
gather_nonzero_optional_params() : optional_params(KernelType::GATHER_NONZERO) {}
};
// Reference (naive) OpenCL implementation of the gather_nonzero kernel.
class GatherNonzeroKernelRef : public KernelBaseOpenCL {
public:
GatherNonzeroKernelRef() : KernelBaseOpenCL("gather_nonzero_ref") {}
virtual ~GatherNonzeroKernelRef() {}
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
protected:
bool Validate(const Params& p, const optional_params& op) const override;
};
} // namespace kernel_selector

View File

@ -0,0 +1,15 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "gather_nonzero_kernel_selector.h"
#include "gather_nonzero_kernel_ref.h"
namespace kernel_selector {
// Registers all available gather_nonzero kernel implementations.
gather_nonzero_kernel_selector::gather_nonzero_kernel_selector() { Attach<GatherNonzeroKernelRef>(); }
// Selects a kernel for the given parameters using the naive strategy.
KernelsData gather_nonzero_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
return GetNaiveBestKernel(params, options, KernelType::GATHER_NONZERO);
}
} // namespace kernel_selector

View File

@ -0,0 +1,23 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "kernel_selector.h"
namespace kernel_selector {
// Singleton selector choosing among gather_nonzero kernel implementations.
class gather_nonzero_kernel_selector : public kernel_selector_base {
public:
static gather_nonzero_kernel_selector& Instance() {
static gather_nonzero_kernel_selector instance_;
return instance_;
}
gather_nonzero_kernel_selector();
virtual ~gather_nonzero_kernel_selector() {}
KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
};
} // namespace kernel_selector

View File

@ -0,0 +1,51 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
// count_nonzero_ref: counts the non-zero elements of the input tensor.
// One work-item per input element; the 3D global work size encodes the dims
// as set up in CountNonzeroKernelRef::GetKernelsData.
// 'output' is the 4-int shape descriptor consumed by gather_nonzero:
//   output[0] = input rank (INPUT0_DIMS)
//   output[1] = number of non-zero elements (accumulated with atomic_add)
//   output[2] = output[3] = 1
// The host zero-fills 'output' before launch (count_nonzero_inst::on_execute);
// the atomic accumulation into output[1] relies on that.
#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"
// Extra expansion step so INPUT_ORDER is macro-substituted before being
// passed to INPUT0_GET_INDEX.
#define INPUT0_GET_INDEX1(idx_order) INPUT0_GET_INDEX(idx_order)
KERNEL (count_nonzero_ref)(const __global INPUT0_TYPE* input,
volatile __global OUTPUT_TYPE* output)
{
const uint gdim0 = (uint)get_global_id(0);
const uint gdim1 = (uint)get_global_id(1);
const uint gdim2 = (uint)get_global_id(2);
// Decompose the flat 3D global id back into tensor coordinates; this must
// mirror the dims_by_gws mapping chosen on the host side.
#if INPUT0_DIMS == 6
#define INPUT_ORDER b,f,w,z,y,x
const uint x = gdim0 % INPUT0_SIZE_X;
const uint y = gdim0 / INPUT0_SIZE_X;
const uint z = gdim1 % INPUT0_SIZE_Z;
const uint w = gdim1 / INPUT0_SIZE_Z;
#elif INPUT0_DIMS == 5
#define INPUT_ORDER b,f,z,y,x
const uint x = gdim0;
const uint y = gdim1 % INPUT0_SIZE_Y;
const uint z = gdim1 / INPUT0_SIZE_Y;
#elif INPUT0_DIMS == 4
#define INPUT_ORDER b,f,y,x
const uint x = gdim0;
const uint y = gdim1;
#endif
const uint f = gdim2 % INPUT0_FEATURE_NUM;
const uint b = gdim2 / INPUT0_FEATURE_NUM;
// 1 if this element is non-zero, 0 otherwise.
uint count = (input[INPUT0_GET_INDEX1(INPUT_ORDER)] == INPUT0_VAL_ZERO) ? 0 : 1;
// Reduce within the sub-group first so only one atomic per sub-group is issued.
count = sub_group_reduce_add(count);
if (get_sub_group_local_id() == 0)
atomic_add(&(output[1]), count);
// A single work-item writes the constant parts of the descriptor.
if (gdim0 == 0 && gdim1 == 0 && gdim2 == 0) {
output[0] = INPUT0_DIMS;
output[2] = 1;
output[3] = 1;
}
}
#undef INPUT0_GET_INDEX1
#undef INPUT_ORDER

View File

@ -0,0 +1,87 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
// gather_nonzero_ref: writes the coordinates of all non-zero input elements.
// Arguments:
//   input        - data tensor.
//   output_shape - 4-int descriptor {rank, #nonzero, 1, 1} from count_nonzero.
//                  output_shape[2] additionally serves as a global slot counter
//                  for the atomics below; count_nonzero wrote 1 there, hence
//                  the 'pos - 1' adjustment to get a 0-based position.
//   output       - rank x #nonzero index matrix, one row per axis
//                  (b, f, [w, z,] y, x), row-major with stride #nonzero.
//                  Column order is whatever the atomics produce (unordered).
#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"
// Extra expansion step so INPUT_ORDER is macro-substituted before being
// passed to INPUT0_GET_INDEX.
#define INPUT0_GET_INDEX1(idx_order) INPUT0_GET_INDEX(idx_order)
KERNEL (gather_nonzero_ref)(const __global INPUT0_TYPE* input,
volatile __global INPUT1_TYPE* output_shape,
__global OUTPUT_TYPE* output)
{
const uint gdim0 = (uint)get_global_id(0);
const uint gdim1 = (uint)get_global_id(1);
const uint gdim2 = (uint)get_global_id(2);
// Decompose the flat 3D global id back into tensor coordinates; this must
// mirror the dims_by_gws mapping chosen on the host side.
#if INPUT0_DIMS == 6
#define INPUT_ORDER b,f,w,z,y,x
const uint x = gdim0 % INPUT0_SIZE_X;
const uint y = gdim0 / INPUT0_SIZE_X;
const uint z = gdim1 % INPUT0_SIZE_Z;
const uint w = gdim1 / INPUT0_SIZE_Z;
#elif INPUT0_DIMS == 5
#define INPUT_ORDER b,f,z,y,x
const uint x = gdim0;
const uint y = gdim1 % INPUT0_SIZE_Y;
const uint z = gdim1 / INPUT0_SIZE_Y;
#elif INPUT0_DIMS == 4
#define INPUT_ORDER b,f,y,x
const uint x = gdim0;
const uint y = gdim1;
#endif
const uint f = gdim2 % INPUT0_FEATURE_NUM;
const uint b = gdim2 / INPUT0_FEATURE_NUM;
// 1 if this element is non-zero; inclusive prefix sum gives each lane its
// offset within the sub-group's block of output slots.
int num_nonzero_acc = (input[INPUT0_GET_INDEX1(INPUT_ORDER)] == INPUT0_VAL_ZERO) ? 0 : 1;
num_nonzero_acc = sub_group_scan_inclusive_add(num_nonzero_acc);
int pos;
// The last lane holds the sub-group total; it reserves that many slots at
// once via a single atomic on the shared counter.
if (get_sub_group_local_id() == (get_sub_group_size() - 1)) {
pos = atomic_add(&(output_shape[2]), num_nonzero_acc);
pos = pos - 1;
}
pos = sub_group_broadcast(pos, (get_sub_group_size() - 1));
// output_shape = {rank, # nonzero, 1, 1}
if (input[INPUT0_GET_INDEX1(INPUT_ORDER)] != INPUT0_VAL_ZERO) {
const int num_nonzero = output_shape[1];
// Final 0-based column for this element: sub-group base + lane offset - 1.
pos = pos + num_nonzero_acc - 1;
int output_b = pos;
int output_f = pos + num_nonzero;
output[output_b] = b;
output[output_f] = f;
#if INPUT0_DIMS == 6
int output_w = pos + num_nonzero * 2;
int output_z = pos + num_nonzero * 3;
int output_y = pos + num_nonzero * 4;
int output_x = pos + num_nonzero * 5;
output[output_w] = w;
output[output_z] = z;
#elif INPUT0_DIMS == 5
int output_z = pos + num_nonzero * 2;
int output_y = pos + num_nonzero * 3;
int output_x = pos + num_nonzero * 4;
output[output_z] = z;
#elif INPUT0_DIMS == 4
int output_y = pos + num_nonzero * 2;
int output_x = pos + num_nonzero * 3;
#endif
output[output_y] = y;
output[output_x] = x;
}
}
#undef INPUT0_GET_INDEX1
#undef INPUT_ORDER

View File

@ -0,0 +1,38 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "intel_gpu/plugin/program.hpp"
#include "intel_gpu/plugin/common_utils.hpp"
#include "ngraph/op/non_zero.hpp"
#include "intel_gpu/primitives/non_zero.hpp"
namespace ov {
namespace intel_gpu {
// Converts ngraph v3::NonZero into a pair of GPU primitives:
//  - count_nonzero  computes the shape descriptor {rank, #nonzero, 1, 1};
//  - gather_nonzero writes the actual indices, consuming that descriptor.
// gather_nonzero receives the op's layer name, so consumers of the NonZero
// output are wired to the gather result.
static void CreateNonZeroOp(Program& p, const std::shared_ptr<ngraph::Node>& op) {
p.ValidateInputs(op, {1});
auto input_primitives = p.GetInputPrimitiveIDs(op);
std::string layer_name = layer_type_name_ID(op);
cldnn::primitive_id count_prim_id = layer_name + "_count";
auto count_prim = cldnn::count_nonzero(count_prim_id,
input_primitives[0],
op->get_friendly_name());
auto gather_prim = cldnn::gather_nonzero(layer_name,
input_primitives[0],
count_prim_id,
op->get_friendly_name());
p.AddPrimitive(count_prim);
p.AddPrimitive(gather_prim);
p.AddPrimitiveToProfiler(op);
}
REGISTER_FACTORY_IMPL(v3, NonZero);
} // namespace intel_gpu
} // namespace ov

View File

@ -0,0 +1,342 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "test_utils.h"
#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/non_zero.hpp>
#include <intel_gpu/runtime/memory.hpp>
#include <intel_gpu/graph/topology.hpp>
#include <intel_gpu/graph/network.hpp>
#include <cstddef>
using namespace cldnn;
using namespace ::tests;
// Runs a standalone count_nonzero primitive on 'input_data' and compares the
// resulting 4-int descriptor {rank, #nonzero, 1, 1} against 'expected_results'.
inline void do_count_non_zero_test(engine& engine,
const cldnn::memory::ptr& input_data,
const std::vector<int32_t>& expected_results)
{
topology topology;
topology.add(input_layout("InputData", input_data->get_layout()));
topology.add(
count_nonzero("count_nonzero", "InputData")
);
network network(engine, topology);
network.set_input_data("InputData", input_data);
auto outputs = network.execute();
auto output = outputs.at("count_nonzero").get_memory();
cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream());
for (size_t i = 0; i < expected_results.size(); ++i) {
EXPECT_EQ(expected_results[i], output_ptr[i]);
}
}
// Runs a standalone gather_nonzero primitive (the shape descriptor is fed in
// as an explicit input) and checks the result against 'expected_results'.
// The comparison is order-independent: each output column (one nonzero point)
// must match some column of the expected rank x #nonzero matrix.
inline void do_gather_non_zero_test(engine& engine,
const cldnn::memory::ptr& input_data,
const cldnn::memory::ptr& output_shape,
const std::vector<int32_t>& expected_results)
{
topology topology;
topology.add(input_layout("InputData", input_data->get_layout()));
topology.add(input_layout("OutputShape", output_shape->get_layout()));
topology.add(
gather_nonzero("gather_nonzero", "InputData", "OutputShape")
);
network network(engine, topology);
network.set_input_data("InputData", input_data);
network.set_input_data("OutputShape", output_shape);
auto outputs = network.execute();
auto output = outputs.at("gather_nonzero").get_memory();
cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream());
cldnn::mem_lock<int32_t> shape_ptr(output_shape, get_test_stream());
int num_ranks = shape_ptr[0];
int num_nonzero = shape_ptr[1];
// For output column i, search expected column j; k walks the rank axis.
for (int i = 0; i < num_nonzero; i++) {
bool found = false;
for (int j = 0; j < num_nonzero; j++) {
for (int k = 0; k < num_ranks; k++) {
if (output_ptr[i+num_nonzero*k] != expected_results[j+num_nonzero*k])
break;
if (k == (num_ranks - 1)) {
found = true;
}
}
if (found)
break;
}
EXPECT_TRUE(found);
}
}
// Full NonZero pipeline test: count_nonzero produces the output shape, which
// feeds gather_nonzero; the gathered coordinates are then verified against
// the expected ones.
//
// @param engine           Engine the network is built on.
// @param input_data       Input tensor memory.
// @param expected_shape   Expected output shape as [rank, nonzero-count, 1, 1].
// @param expected_results Expected coordinates, rank-major: component k of
//                         column j is at expected_results[j + count * k].
inline void do_non_zero_test(engine& engine,
                             const cldnn::memory::ptr& input_data,
                             const std::vector<int32_t>& expected_shape,
                             const std::vector<int32_t>& expected_results)
{
    topology topology;
    topology.add(input_layout("InputData", input_data->get_layout()));
    topology.add(
        count_nonzero("count_nonzero", "InputData")
    );
    topology.add(
        gather_nonzero("gather_nonzero", "InputData", "count_nonzero")
    );
    network network(engine, topology);
    network.set_input_data("InputData", input_data);
    auto outputs = network.execute();
    auto output = outputs.at("gather_nonzero").get_memory();
    cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream());
    std::vector<int32_t> output_list = std::vector<int32_t>(output_ptr.begin(), output_ptr.end());
    int num_ranks = expected_shape[0];
    int num_nonzero = expected_shape[1];
    // Cast to size_t to avoid a signed/unsigned comparison against
    // vector::size() (the original int*int expression triggers -Wsign-compare).
    EXPECT_EQ(static_cast<size_t>(num_ranks) * num_nonzero, output_list.size());
    // The order of the gathered coordinates is not guaranteed, so check that
    // every output column matches some expected column (order-agnostic).
    for (int i = 0; i < num_nonzero; i++) {
        bool found = false;
        for (int j = 0; j < num_nonzero; j++) {
            for (int k = 0; k < num_ranks; k++) {
                if (output_list[i+num_nonzero*k] != expected_results[j+num_nonzero*k])
                    break;
                if (k == (num_ranks - 1)) {
                    found = true;
                }
            }
            if (found)
                break;
        }
        EXPECT_TRUE(found);
    }
}
// count_nonzero on a 1x3x3x1 fp16 input containing exactly one zero:
// the expected output is [rank, nonzero-count, 1, 1] = [4, 8, 1, 1].
TEST(count_nonzero_gpu_fp16, test1) {
    auto& engine = get_test_engine();

    auto input = engine.allocate_memory({ data_types::f16, format::bfyx, tensor{ 1, 3, 3, 1 } });
    set_values(input, {
        FLOAT16(0), FLOAT16(1), FLOAT16(8),
        FLOAT16(5), FLOAT16(5), FLOAT16(2),
        FLOAT16(7), FLOAT16(10), FLOAT16(4),
    });

    const std::vector<int32_t> expected_results = { 4, 8, 1, 1 };

    do_count_non_zero_test(engine, input, expected_results);
}
// gather_nonzero with an explicitly provided shape input: eight of the nine
// fp16 values are non-zero, so eight coordinate columns are expected.
TEST(gather_nonzero_gpu_fp16, test1) {
    auto& engine = get_test_engine();

    auto input = engine.allocate_memory({ data_types::f16, format::bfyx, tensor{ 1, 3, 3, 1 } });
    auto output_shape = engine.allocate_memory({ data_types::i32, format::bfyx, tensor{ 1, 1, 4, 1 } });

    set_values(input, {
        FLOAT16(0), FLOAT16(1), FLOAT16(8),
        FLOAT16(5), FLOAT16(5), FLOAT16(2),
        FLOAT16(7), FLOAT16(10), FLOAT16(4),
    });
    // [rank, nonzero-count, 1, 1]
    set_values(output_shape, {
        4, 8, 1, 1,
    });

    // One row per dimension; one column per non-zero element.
    const std::vector<int32_t> expected_results = {
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 2, 2, 2,
        0, 0, 0, 0, 0, 0, 0, 0,
        1, 2, 0, 1, 2, 0, 1, 2,
    };

    do_gather_non_zero_test(engine, input, output_shape, expected_results);
}
// gather_nonzero with three zeros in the input: six coordinate columns
// are expected.
TEST(gather_nonzero_gpu_fp16, test2) {
    auto& engine = get_test_engine();

    auto input = engine.allocate_memory({ data_types::f16, format::bfyx, tensor{ 1, 3, 3, 1 } });
    auto output_shape = engine.allocate_memory({ data_types::i32, format::bfyx, tensor{ 1, 1, 4, 1 } });

    set_values(input, {
        FLOAT16(0), FLOAT16(1), FLOAT16(8),
        FLOAT16(5), FLOAT16(5), FLOAT16(0),
        FLOAT16(7), FLOAT16(0), FLOAT16(4),
    });
    // [rank, nonzero-count, 1, 1]
    set_values(output_shape, {
        4, 6, 1, 1,
    });

    // One row per dimension; one column per non-zero element.
    const std::vector<int32_t> expected_results = {
        0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 2, 2,
        0, 0, 0, 0, 0, 0,
        1, 2, 0, 1, 0, 2,
    };

    do_gather_non_zero_test(engine, input, output_shape, expected_results);
}
// End-to-end NonZero (count_nonzero + gather_nonzero) on a 4D fp16 input
// with three zeros: expects rank 4 and six non-zero coordinates.
TEST(nonzero_gpu_fp16, test1) {
    auto& engine = get_test_engine();

    auto input = engine.allocate_memory({ data_types::f16, format::bfyx, tensor{ 1, 3, 3, 1 } });
    set_values(input, {
        FLOAT16(0), FLOAT16(1), FLOAT16(8),
        FLOAT16(5), FLOAT16(5), FLOAT16(0),
        FLOAT16(7), FLOAT16(0), FLOAT16(4),
    });

    // [rank, nonzero-count, 1, 1]
    const std::vector<int32_t> expected_shape = { 4, 6, 1, 1 };

    // One row per dimension; one column per non-zero element.
    const std::vector<int32_t> expected_results = {
        0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 2, 2,
        0, 0, 0, 0, 0, 0,
        1, 2, 0, 1, 0, 2,
    };

    do_non_zero_test(engine, input, expected_shape, expected_results);
}
// End-to-end NonZero on a rank-5 (bfzyx) fp16 tensor: 24 of the 27 elements
// are non-zero (three zeros in the data below).
TEST(nonzero_gpu_fp16, test2) {
    auto& engine = get_test_engine();
    auto input = engine.allocate_memory({ data_types::f16, format::bfzyx, tensor{ 1, 3, 3, 3, 1 } });
    set_values(input, {
        FLOAT16(0), FLOAT16(1), FLOAT16(8),
        FLOAT16(7), FLOAT16(10), FLOAT16(4),
        FLOAT16(7), FLOAT16(0), FLOAT16(4),
        FLOAT16(9), FLOAT16(5), FLOAT16(1),
        FLOAT16(2), FLOAT16(0), FLOAT16(8),
        FLOAT16(2), FLOAT16(10), FLOAT16(7),
        FLOAT16(2), FLOAT16(4), FLOAT16(8),
        FLOAT16(5), FLOAT16(9), FLOAT16(10),
        FLOAT16(10), FLOAT16(5), FLOAT16(2),
    });
    // [rank of input (5), number of non-zero elements (24), 1, 1]
    std::vector<int32_t> expected_shape = {
        5, 24, 1, 1,
    };
    // One row per tensor dimension (outermost first); column i holds the full
    // coordinate of the i-th non-zero element.
    std::vector<int32_t> expected_results = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 2, 2, 0, 0, 0, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2,
        1, 2, 0, 1, 2, 0, 2, 0, 1, 2, 0, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2,
    };
    do_non_zero_test(engine, input, expected_shape, expected_results);
}
// End-to-end NonZero on a rank-6 (bfwzyx) fp16 tensor: 73 of the 81 elements
// are non-zero (eight zeros in the data below).
TEST(nonzero_gpu_fp16, test3) {
    auto& engine = get_test_engine();
    auto input = engine.allocate_memory({ data_types::f16, format::bfwzyx, tensor{ 1, 3, 3, 3, 3, 1 } });
    set_values(input, {
        FLOAT16(0), FLOAT16(1), FLOAT16(8), FLOAT16(7), FLOAT16(10), FLOAT16(4), FLOAT16(6), FLOAT16(5), FLOAT16(4),
        FLOAT16(7), FLOAT16(0), FLOAT16(4), FLOAT16(9), FLOAT16(5), FLOAT16(1), FLOAT16(2), FLOAT16(2), FLOAT16(0),
        FLOAT16(2), FLOAT16(0), FLOAT16(8), FLOAT16(2), FLOAT16(10), FLOAT16(7), FLOAT16(7), FLOAT16(0), FLOAT16(6),
        FLOAT16(2), FLOAT16(4), FLOAT16(8), FLOAT16(5), FLOAT16(9), FLOAT16(10), FLOAT16(10), FLOAT16(5), FLOAT16(2),
        FLOAT16(4), FLOAT16(8), FLOAT16(2), FLOAT16(1), FLOAT16(4), FLOAT16(10), FLOAT16(10), FLOAT16(2), FLOAT16(21),
        FLOAT16(0), FLOAT16(1), FLOAT16(5), FLOAT16(1), FLOAT16(5), FLOAT16(1), FLOAT16(9), FLOAT16(4), FLOAT16(22),
        FLOAT16(4), FLOAT16(3), FLOAT16(7), FLOAT16(6), FLOAT16(9), FLOAT16(8), FLOAT16(9), FLOAT16(7), FLOAT16(23),
        FLOAT16(4), FLOAT16(10), FLOAT16(6), FLOAT16(3), FLOAT16(5), FLOAT16(5), FLOAT16(4), FLOAT16(2), FLOAT16(23),
        FLOAT16(0), FLOAT16(4), FLOAT16(5), FLOAT16(3), FLOAT16(1), FLOAT16(2), FLOAT16(8), FLOAT16(5), FLOAT16(0),
    });
    // [rank of input (6), number of non-zero elements (73), 1, 1]
    std::vector<int32_t> expected_shape = {
        6, 73, 1, 1,
    };
    // One row per tensor dimension (outermost first); column i holds the full
    // coordinate of the i-th non-zero element.
    std::vector<int32_t> expected_results = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
        0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 1, 1, 1, 2, 2, 0, 0, 1, 1, 1, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 1, 1, 1, 2, 2,
        1, 2, 0, 1, 2, 0, 1, 2, 0, 2, 0, 1, 2, 0, 1, 0, 2, 0, 1, 2, 0, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 1, 2, 0, 1, 2, 0, 1,
    };
    do_non_zero_test(engine, input, expected_shape, expected_results);
}
// End-to-end NonZero on a rank-4 (bfyx) fp32 tensor: 104 of the 120 elements
// are non-zero (sixteen zeros in the data below).
TEST(nonzero_gpu_fp32, test1) {
    auto& engine = get_test_engine();
    auto input = engine.allocate_memory({ data_types::f32, format::bfyx, tensor{ 2, 3, 5, 4 } });
    set_values(input, {
        6, 6, 0, 3, 0, 4, 1, 0, 8, 4, 8, 5, 8, 6, 0, 2, 0, 9, 6, 9,
        1, 2, 4, 9, 0, 8, 5, 7, 4, 6, 8, 0, 6, 2, 3, 5, 0, 9, 8, 7,
        3, 6, 5, 3, 8, 4, 7, 5, 7, 8, 5, 2, 1, 8, 9, 2, 1, 4, 3, 3,
        7, 3, 9, 9, 0, 2, 4, 0, 4, 9, 5, 9, 4, 5, 8, 1, 2, 9, 7, 6,
        7, 9, 6, 7, 2, 9, 2, 7, 8, 3, 1, 2, 7, 4, 6, 2, 3, 7, 0, 5,
        2, 3, 7, 7, 0, 3, 4, 0, 9, 0, 9, 0, 2, 7, 7, 8, 6, 6, 0, 8,
    });
    // [rank of input (4), number of non-zero elements (104), 1, 1]
    std::vector<int32_t> expected_shape = {
        4, 104, 1, 1,
    };
    // One logical row per tensor dimension (outermost first); each 104-element
    // row is wrapped across multiple source lines below. Column i holds the
    // full coordinate of the i-th non-zero element.
    std::vector<int32_t> expected_results = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
        0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
        0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3,
        0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3,
        0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3,
        0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
        0, 1, 3, 0, 1, 3, 4, 0, 1, 2, 3, 0, 2, 3, 4,
        0, 1, 2, 3, 0, 1, 2, 3, 4, 0, 2, 3, 4, 0, 2, 3, 4,
        0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4,
        0, 1, 2, 3, 0, 1, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4,
        0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 4,
        0, 1, 2, 3, 0, 1, 3, 0, 2, 3, 4, 0, 1, 2, 4,
    };
    do_non_zero_test(engine, input, expected_shape, expected_results);
}