[GPU] NonZero primitive (#12519)

parent 03b7c1e69e
commit dd55f434c3
@@ -155,6 +155,7 @@ REGISTER_FACTORY(v3, EmbeddingBagOffsetsSum);
REGISTER_FACTORY(v3, EmbeddingBagPackedSum);
REGISTER_FACTORY(v3, EmbeddingSegmentsSum);
REGISTER_FACTORY(v3, ExtractImagePatches);
REGISTER_FACTORY(v3, NonZero);
REGISTER_FACTORY(v3, ROIAlign);
REGISTER_FACTORY(v3, ScatterUpdate);
REGISTER_FACTORY(v3, ScatterElementsUpdate);
@@ -0,0 +1,39 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "primitive.hpp"

namespace cldnn {

struct count_nonzero : public primitive_base<count_nonzero> {
    CLDNN_DECLARE_PRIMITIVE(count_nonzero)

    /// @brief Constructs count_nonzero primitive.
    /// @param id This primitive id.
    /// @param data Input data primitive id.
    count_nonzero(const primitive_id& id,
                  const primitive_id& data,
                  const primitive_id& ext_prim_id = "",
                  const padding& output_padding = padding())
        : primitive_base(id, {data}, ext_prim_id, output_padding) {}
};

struct gather_nonzero : public primitive_base<gather_nonzero> {
    CLDNN_DECLARE_PRIMITIVE(gather_nonzero)

    /// @brief Constructs gather_nonzero primitive.
    /// @param id This primitive id.
    /// @param data Input data primitive id.
    /// @param output_shape Output shape [rank of data, number of nonzero elements]
    gather_nonzero(const primitive_id& id,
                   const primitive_id& data,
                   const primitive_id& output_shape,
                   const primitive_id& ext_prim_id = "",
                   const padding& output_padding = padding())
        : primitive_base(id, {data, output_shape}, ext_prim_id, output_padding) {}
};

}  // namespace cldnn
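The two primitives are meant to be used as a pair: count_nonzero writes a 4-element i32 buffer holding [input rank, nonzero count, 1, 1], and gather_nonzero takes that buffer as its output_shape input to size its result. A minimal sketch of how a caller wires them into a topology (identifiers here are illustrative; the same pattern appears in the tests added later in this change):

#include <intel_gpu/graph/topology.hpp>
#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/non_zero.hpp>
#include <intel_gpu/runtime/memory.hpp>

// Build a NonZero subgraph: count first, then gather using the count as the shape input.
static cldnn::topology make_nonzero_topology(const cldnn::memory::ptr& input_mem) {
    cldnn::topology topo;
    topo.add(cldnn::input_layout("input", input_mem->get_layout()));
    topo.add(cldnn::count_nonzero("nz_count", "input"));                // -> [rank, count, 1, 1]
    topo.add(cldnn::gather_nonzero("nz_gather", "input", "nz_count"));  // -> packed indices
    return topo;
}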
src/plugins/intel_gpu/src/graph/impls/ocl/non_zero.cpp (new file, 119 lines)
@@ -0,0 +1,119 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "non_zero_inst.h"
#include "primitive_base.hpp"
#include "impls/implementation_map.hpp"
#include "kernel_selector_helper.h"
#include "non_zero/count_nonzero_kernel_ref.h"
#include "non_zero/count_nonzero_kernel_selector.h"
#include "non_zero/gather_nonzero_kernel_ref.h"
#include "non_zero/gather_nonzero_kernel_selector.h"
#include "intel_gpu/runtime/error_handler.hpp"

using namespace cldnn;

namespace cldnn {
namespace ocl {

struct count_nonzero_impl : typed_primitive_impl_ocl<count_nonzero> {
    using parent = typed_primitive_impl_ocl<count_nonzero>;
    using parent::parent;

    std::unique_ptr<primitive_impl> clone() const override {
        return make_unique<count_nonzero_impl>(*this);
    }

    static primitive_impl* create(const count_nonzero_node& arg, const kernel_impl_params& impl_param) {
        auto nonzero_params = get_default_params<kernel_selector::count_nonzero_params>(impl_param);
        auto nonzero_optional_params =
            get_default_optional_params<kernel_selector::count_nonzero_optional_params>(arg.get_program());

        auto& kernel_selector = kernel_selector::count_nonzero_kernel_selector::Instance();
        auto best_kernels = kernel_selector.GetBestKernels(nonzero_params, nonzero_optional_params);

        OPENVINO_ASSERT(!best_kernels.empty(), "Cannot find a proper kernel for ", arg.id());

        auto count_nonzero = new count_nonzero_impl(arg, best_kernels[0]);

        return count_nonzero;
    }
};

struct gather_nonzero_impl : typed_primitive_impl_ocl<gather_nonzero> {
    using parent = typed_primitive_impl_ocl<gather_nonzero>;
    using parent::parent;

    std::unique_ptr<primitive_impl> clone() const override {
        return make_unique<gather_nonzero_impl>(*this);
    }

public:
    static primitive_impl* create(const gather_nonzero_node& arg, const kernel_impl_params& impl_param) {
        auto nonzero_params = get_default_params<kernel_selector::gather_nonzero_params>(impl_param);
        auto nonzero_optional_params =
            get_default_optional_params<kernel_selector::gather_nonzero_optional_params>(arg.get_program());

        nonzero_params.inputs.push_back(convert_data_tensor(arg.input(1).get_output_layout()));

        auto& kernel_selector = kernel_selector::gather_nonzero_kernel_selector::Instance();
        auto best_kernels = kernel_selector.GetBestKernels(nonzero_params, nonzero_optional_params);

        OPENVINO_ASSERT(!best_kernels.empty(), "Cannot find a proper kernel for ", arg.id());

        auto gather_nonzero = new gather_nonzero_impl(arg, best_kernels[0]);

        return gather_nonzero;
    }
};

namespace detail {

attach_count_nonzero_impl::attach_count_nonzero_impl() {
    implementation_map<count_nonzero>::add(impl_types::ocl, count_nonzero_impl::create, {
        std::make_tuple(data_types::f32, format::bfyx),
        std::make_tuple(data_types::f16, format::bfyx),
        std::make_tuple(data_types::i32, format::bfyx),
        std::make_tuple(data_types::i8, format::bfyx),
        std::make_tuple(data_types::u8, format::bfyx),

        std::make_tuple(data_types::f32, format::bfzyx),
        std::make_tuple(data_types::f16, format::bfzyx),
        std::make_tuple(data_types::i32, format::bfzyx),
        std::make_tuple(data_types::i8, format::bfzyx),
        std::make_tuple(data_types::u8, format::bfzyx),

        std::make_tuple(data_types::f32, format::bfwzyx),
        std::make_tuple(data_types::f16, format::bfwzyx),
        std::make_tuple(data_types::i32, format::bfwzyx),
        std::make_tuple(data_types::i8, format::bfwzyx),
        std::make_tuple(data_types::u8, format::bfwzyx),
    });
}

attach_gather_nonzero_impl::attach_gather_nonzero_impl() {
    implementation_map<gather_nonzero>::add(impl_types::ocl, gather_nonzero_impl::create, {
        std::make_tuple(data_types::f32, format::bfyx),
        std::make_tuple(data_types::f16, format::bfyx),
        std::make_tuple(data_types::i32, format::bfyx),
        std::make_tuple(data_types::i8, format::bfyx),
        std::make_tuple(data_types::u8, format::bfyx),

        std::make_tuple(data_types::f32, format::bfzyx),
        std::make_tuple(data_types::f16, format::bfzyx),
        std::make_tuple(data_types::i32, format::bfzyx),
        std::make_tuple(data_types::i8, format::bfzyx),
        std::make_tuple(data_types::u8, format::bfzyx),

        std::make_tuple(data_types::f32, format::bfwzyx),
        std::make_tuple(data_types::f16, format::bfwzyx),
        std::make_tuple(data_types::i32, format::bfwzyx),
        std::make_tuple(data_types::i8, format::bfwzyx),
        std::make_tuple(data_types::u8, format::bfwzyx),
    });
}

}  // namespace detail
}  // namespace ocl
}  // namespace cldnn
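Note that gather_nonzero_impl::create appends the layout of the second dependency (the count_nonzero result) to nonzero_params.inputs, which is why the reference kernel declares two buffer arguments (GetArgsDesc(2, false, false) in gather_nonzero_kernel_ref.cpp below) while count_nonzero declares only one.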
@@ -90,6 +90,8 @@ void register_implementations() {
    REGISTER_OCL(embedding_bag);
    REGISTER_OCL(extract_image_patches);
    REGISTER_OCL(convert_color);
    REGISTER_OCL(count_nonzero);
    REGISTER_OCL(gather_nonzero);
}

}  // namespace ocl
@@ -73,6 +73,7 @@
#include "intel_gpu/primitives/space_to_batch.hpp"
#include "intel_gpu/primitives/strided_slice.hpp"
#include "intel_gpu/primitives/tile.hpp"
#include "intel_gpu/primitives/non_zero.hpp"

namespace cldnn {
namespace ocl {
@@ -166,6 +167,8 @@ REGISTER_OCL(cum_sum);
REGISTER_OCL(embedding_bag);
REGISTER_OCL(extract_image_patches);
REGISTER_OCL(convert_color);
REGISTER_OCL(count_nonzero);
REGISTER_OCL(gather_nonzero);

#undef REGISTER_OCL
src/plugins/intel_gpu/src/graph/include/non_zero_inst.h (new file, 81 lines)
@@ -0,0 +1,81 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

///////////////////////////////////////////////////////////////////////////////////////////////////
#pragma once
#include "intel_gpu/primitives/non_zero.hpp"
#include "primitive_inst.h"
#include "intel_gpu/runtime/error_handler.hpp"

#include <string>
#include <memory>

namespace cldnn {

template <>
struct typed_program_node<count_nonzero> : public typed_program_node_base<count_nonzero> {
    using parent = typed_program_node_base<count_nonzero>;
    typed_program_node(const std::shared_ptr<count_nonzero> prim, program& prog) : parent(prim, prog) {
        support_padding_all(true);
    }
public:
    using parent::parent;

    program_node& input() const {
        OPENVINO_ASSERT(dependencies.size() == 1, "[GPU] Primitive ", id(), " has invalid number of dependencies");
        return get_dependency(0);
    }
};

using count_nonzero_node = typed_program_node<count_nonzero>;

template <>
class typed_primitive_inst<count_nonzero> : public typed_primitive_inst_base<count_nonzero> {
    using parent = typed_primitive_inst_base<count_nonzero>;

public:
    static layout calc_output_layout(count_nonzero_node const& node, kernel_impl_params const& impl_param);
    static std::string to_string(count_nonzero_node const& node);

    typed_primitive_inst(network& network, count_nonzero_node const& node);

private:
    void on_execute() override;
};

using count_nonzero_inst = typed_primitive_inst<count_nonzero>;

template <>
struct typed_program_node<gather_nonzero> : public typed_program_node_base<gather_nonzero> {
    using parent = typed_program_node_base<gather_nonzero>;
    typed_program_node(const std::shared_ptr<gather_nonzero> prim, program& prog) : parent(prim, prog) {
        support_padding_all(true);
    }

public:
    using parent::parent;

    bool generates_dynamic_output() const override { return true; }

    program_node& input(size_t index = 0) const {
        OPENVINO_ASSERT(dependencies.size() == 2, "[GPU] Primitive ", id(), " has invalid number of dependencies");
        return get_dependency(index);
    }
};

using gather_nonzero_node = typed_program_node<gather_nonzero>;

template <>
class typed_primitive_inst<gather_nonzero> : public typed_primitive_inst_base<gather_nonzero> {
    using parent = typed_primitive_inst_base<gather_nonzero>;

public:
    static layout calc_output_layout(gather_nonzero_node const& node, kernel_impl_params const& impl_param);
    static std::string to_string(gather_nonzero_node const& node);

    typed_primitive_inst(network& network, gather_nonzero_node const& node);
};

using gather_nonzero_inst = typed_primitive_inst<gather_nonzero>;
}  // namespace cldnn
src/plugins/intel_gpu/src/graph/non_zero.cpp (new file, 93 lines)
@@ -0,0 +1,93 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

///////////////////////////////////////////////////////////////////////////////////////////////////
#include "non_zero_inst.h"
#include "primitive_type_base.h"
#include "intel_gpu/runtime/memory.hpp"
#include "intel_gpu/runtime/error_handler.hpp"
#include "intel_gpu/runtime/debug_configuration.hpp"
#include "json_object.h"
#include <string>

namespace cldnn {

// -----------------------------------------------
// count_nonzero
// -----------------------------------------------
primitive_type_id count_nonzero::type_id() {
    static primitive_type_base<count_nonzero> instance;
    return &instance;
}

layout count_nonzero_inst::calc_output_layout(count_nonzero_node const& node, kernel_impl_params const& impl_param) {
    assert(static_cast<bool>(node.get_primitive()->output_data_type) == false &&
           "Output data type forcing is not supported for count_nonzero_node!");
    return layout{cldnn::data_types::i32, cldnn::format::bfyx, tensor{1, 1, 1, 4}};
}

std::string count_nonzero_inst::to_string(count_nonzero_node const& node) {
    auto desc = node.get_primitive();
    auto node_info = node.desc_to_json();
    auto& input = node.input();

    std::stringstream primitive_description;

    json_composite count_nonzero_info;
    count_nonzero_info.add("input id", input.id());
    count_nonzero_info.add("output shape", tensor{1, 1, 1, 4});

    node_info->add("count_nonzero info", count_nonzero_info);
    node_info->dump(primitive_description);

    return primitive_description.str();
}

count_nonzero_inst::typed_primitive_inst(network& network, count_nonzero_node const& node) : parent(network, node) {}

void count_nonzero_inst::on_execute() {
    output_memory().fill(_network.get_stream(), 0);
}

// -----------------------------------------------
// gather_nonzero
// -----------------------------------------------
primitive_type_id gather_nonzero::type_id() {
    static primitive_type_base<gather_nonzero> instance;
    return &instance;
}

layout gather_nonzero_inst::calc_output_layout(gather_nonzero_node const& node, kernel_impl_params const& impl_param) {
    assert(static_cast<bool>(node.get_primitive()->output_data_type) == false &&
           "Output data type forcing is not supported for gather_nonzero_node!");
    if (impl_param.memory_deps.count(1)) {
        auto out_size = read_vector<int64_t>(impl_param.memory_deps.at(1), impl_param.prog.get_stream());
        ov::Shape output_shape(out_size.begin(), out_size.end());
        ov::PartialShape output_pshape(output_shape);
        return layout{output_pshape, cldnn::data_types::i32, cldnn::format::bfyx};
    } else {
        return layout{ov::PartialShape({ov::Dimension::dynamic(), ov::Dimension::dynamic(), 1, 1}), cldnn::data_types::i32, cldnn::format::bfyx};
    }
}

std::string gather_nonzero_inst::to_string(gather_nonzero_node const& node) {
    auto desc = node.get_primitive();
    auto node_info = node.desc_to_json();
    auto& input = node.input();

    std::stringstream primitive_description;

    json_composite gather_nonzero_info;
    gather_nonzero_info.add("input id", input.id());
    gather_nonzero_info.add("output layout", node.get_output_layout().to_string());

    node_info->add("gather_nonzero info", gather_nonzero_info);
    node_info->dump(primitive_description);

    return primitive_description.str();
}

gather_nonzero_inst::typed_primitive_inst(network& network, gather_nonzero_node const& node) : parent(network, node, false) {}

}  // namespace cldnn
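count_nonzero always reports a static i32 layout of tensor{1, 1, 1, 4}; at runtime that buffer carries [rank, nonzero count, 1, 1]. gather_nonzero reads the buffer through memory dependency 1 when it is available and otherwise falls back to a dynamic placeholder shape. A worked example, using the values from the nonzero_gpu_fp16 test added below:

// input              : f16, bfyx, 1x3x3x1, containing 6 nonzero elements
// count_nonzero out   : i32 buffer {4, 6, 1, 1}              (rank, count, 1, 1)
// gather_nonzero out  : i32, bfyx, PartialShape {4, 6, 1, 1} (4 index components for each of the 6 nonzero elements)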
@@ -32,6 +32,8 @@ enum class KernelType {
    SCALE,
    REORDER,
    RESHAPE,
    COUNT_NONZERO,
    GATHER_NONZERO,
    PERMUTE,
    CONCATENATION,
    RESAMPLE,
@@ -0,0 +1,92 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "count_nonzero_kernel_ref.h"
#include "kernel_selector_utils.h"
#include <string>

namespace kernel_selector {
ParamsKey CountNonzeroKernelRef::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::F16);
    k.EnableInputDataType(Datatype::F32);
    k.EnableInputDataType(Datatype::INT8);
    k.EnableInputDataType(Datatype::UINT8);
    k.EnableInputDataType(Datatype::INT32);
    k.EnableInputDataType(Datatype::UINT32);
    k.EnableInputDataType(Datatype::INT64);
    k.EnableOutputDataType(Datatype::F16);
    k.EnableOutputDataType(Datatype::F32);
    k.EnableOutputDataType(Datatype::UINT8);
    k.EnableOutputDataType(Datatype::INT8);
    k.EnableOutputDataType(Datatype::INT32);
    k.EnableOutputDataType(Datatype::UINT32);
    k.EnableOutputDataType(Datatype::INT64);
    k.EnableAllInputLayout();
    k.EnableAllOutputLayout();
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableBatching();
    k.EnableDifferentTypes();
    return k;
}

KernelsData CountNonzeroKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
    assert(params.GetType() == KernelType::COUNT_NONZERO);

    KernelData kd = KernelData::Default<count_nonzero_params>(params);
    count_nonzero_params& newParams = *static_cast<count_nonzero_params*>(kd.params.get());

    auto entry_point = GetEntryPoint(kernelName, newParams.layerID, params, options);
    auto cldnn_jit = MakeBaseParamsJitConstants(newParams);
    auto jit = CreateJit(kernelName, cldnn_jit, entry_point);

    const auto& in = newParams.inputs[0];
    auto& kernel = kd.kernels[0];
    const auto& in_dims = in.GetDims();

    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws;

    if (in_dims.size() == 4) {
        kernel.params.workGroups.global = {in_dims[0].v, in_dims[1].v, in_dims[2].v * in_dims[3].v};
        dims_by_gws = {{Tensor::DataChannelName::X},
                       {Tensor::DataChannelName::Y},
                       {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
    } else if (in_dims.size() == 5) {
        kernel.params.workGroups.global = {in_dims[0].v, in_dims[1].v * in_dims[2].v, in_dims[3].v * in_dims[4].v};
        dims_by_gws = {{Tensor::DataChannelName::X},
                       {Tensor::DataChannelName::Y, Tensor::DataChannelName::Z},
                       {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
    } else {
        kernel.params.workGroups.global = {in_dims[0].v * in_dims[1].v, in_dims[2].v * in_dims[3].v, in_dims[4].v * in_dims[5].v};
        dims_by_gws = {{Tensor::DataChannelName::X, Tensor::DataChannelName::Y},
                       {Tensor::DataChannelName::Z, Tensor::DataChannelName::W},
                       {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
    }

    kernel.params.workGroups.local = GetOptimalLocalWorkGroupSizes(kernel.params.workGroups.global,
                                                                   params.engineInfo,
                                                                   newParams.inputs[0].GetLayout(),
                                                                   newParams.outputs[0].GetLayout(),
                                                                   dims_by_gws);

    kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT);
    kernel.params.arguments = GetArgsDesc(1, false, false);

    return {kd};
}

KernelsPriority CountNonzeroKernelRef::GetKernelsPriority(const Params& /*params*/, const optional_params& /*options*/) const {
    return DONT_USE_IF_HAVE_SOMETHING_ELSE;
}

bool CountNonzeroKernelRef::Validate(const Params& p, const optional_params& op) const {
    if (!KernelBaseOpenCL::Validate(p, op))
        return false;

    const auto& rp = static_cast<const count_nonzero_params&>(p);

    return Tensor::SimpleLayout(rp.inputs[0].GetLayout());
}
}  // namespace kernel_selector
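The dispatch mirrors dims_by_gws: the global work size is {X, Y, F*B} for 4-D inputs, {X, Y*Z, F*B} for 5-D, and {X*Y, Z*W, F*B} for 6-D, so each input element is visited by exactly one work item. As an illustration with assumed sizes, a bfyx tensor with X=5, Y=4, F=3, B=2 yields a global work size of {5, 4, 6}, i.e. 120 work items for 120 elements.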
@@ -0,0 +1,36 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "kernel_base_opencl.h"

namespace kernel_selector {
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// count_nonzero_params
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
struct count_nonzero_params : public base_params {
    count_nonzero_params() : base_params(KernelType::COUNT_NONZERO) {}
};

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// count_nonzero_optional_params
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
struct count_nonzero_optional_params : optional_params {
    count_nonzero_optional_params() : optional_params(KernelType::COUNT_NONZERO) {}
};

class CountNonzeroKernelRef : public KernelBaseOpenCL {
public:
    CountNonzeroKernelRef() : KernelBaseOpenCL("count_nonzero_ref") {}
    virtual ~CountNonzeroKernelRef() {}

    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
    KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
    ParamsKey GetSupportedKey() const override;

protected:
    bool Validate(const Params& p, const optional_params& op) const override;
};
}  // namespace kernel_selector
@@ -0,0 +1,15 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "count_nonzero_kernel_selector.h"
#include "count_nonzero_kernel_ref.h"

namespace kernel_selector {

count_nonzero_kernel_selector::count_nonzero_kernel_selector() { Attach<CountNonzeroKernelRef>(); }

KernelsData count_nonzero_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
    return GetNaiveBestKernel(params, options, KernelType::COUNT_NONZERO);
}
}  // namespace kernel_selector
@@ -0,0 +1,23 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "kernel_selector.h"

namespace kernel_selector {
class count_nonzero_kernel_selector : public kernel_selector_base {
public:
    static count_nonzero_kernel_selector& Instance() {
        static count_nonzero_kernel_selector instance_;
        return instance_;
    }

    count_nonzero_kernel_selector();

    virtual ~count_nonzero_kernel_selector() {}

    KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
};
}  // namespace kernel_selector
@@ -0,0 +1,96 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "gather_nonzero_kernel_ref.h"
#include "kernel_selector_utils.h"
#include <string>

namespace kernel_selector {
ParamsKey GatherNonzeroKernelRef::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::F16);
    k.EnableInputDataType(Datatype::F32);
    k.EnableInputDataType(Datatype::INT8);
    k.EnableInputDataType(Datatype::UINT8);
    k.EnableInputDataType(Datatype::INT32);
    k.EnableInputDataType(Datatype::UINT32);
    k.EnableInputDataType(Datatype::INT64);
    k.EnableOutputDataType(Datatype::F16);
    k.EnableOutputDataType(Datatype::F32);
    k.EnableOutputDataType(Datatype::UINT8);
    k.EnableOutputDataType(Datatype::INT8);
    k.EnableOutputDataType(Datatype::INT32);
    k.EnableOutputDataType(Datatype::UINT32);
    k.EnableOutputDataType(Datatype::INT64);
    k.EnableInputLayout(DataLayout::bfyx);
    k.EnableOutputLayout(DataLayout::bfyx);
    k.EnableInputLayout(DataLayout::bfzyx);
    k.EnableOutputLayout(DataLayout::bfzyx);
    k.EnableInputLayout(DataLayout::bfwzyx);
    k.EnableOutputLayout(DataLayout::bfwzyx);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableBatching();
    k.EnableDifferentTypes();
    return k;
}

KernelsData GatherNonzeroKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
    assert(params.GetType() == KernelType::GATHER_NONZERO);

    KernelData kd = KernelData::Default<gather_nonzero_params>(params);
    gather_nonzero_params& newParams = *static_cast<gather_nonzero_params*>(kd.params.get());

    auto entry_point = GetEntryPoint(kernelName, newParams.layerID, params, options);
    auto cldnn_jit = MakeBaseParamsJitConstants(newParams);
    auto jit = CreateJit(kernelName, cldnn_jit, entry_point);

    const auto& in = newParams.inputs[0];
    auto& kernel = kd.kernels[0];
    const auto& in_dims = in.GetDims();

    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws;

    if (in_dims.size() == 4) {
        kernel.params.workGroups.global = {in_dims[0].v, in_dims[1].v, in_dims[2].v * in_dims[3].v};
        dims_by_gws = {{Tensor::DataChannelName::X},
                       {Tensor::DataChannelName::Y},
                       {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
    } else if (in_dims.size() == 5) {
        kernel.params.workGroups.global = {in_dims[0].v, in_dims[1].v * in_dims[2].v, in_dims[3].v * in_dims[4].v};
        dims_by_gws = {{Tensor::DataChannelName::X},
                       {Tensor::DataChannelName::Y, Tensor::DataChannelName::Z},
                       {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
    } else {
        kernel.params.workGroups.global = {in_dims[0].v * in_dims[1].v, in_dims[2].v * in_dims[3].v, in_dims[4].v * in_dims[5].v};
        dims_by_gws = {{Tensor::DataChannelName::X, Tensor::DataChannelName::Y},
                       {Tensor::DataChannelName::Z, Tensor::DataChannelName::W},
                       {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
    }

    kernel.params.workGroups.local = GetOptimalLocalWorkGroupSizes(kernel.params.workGroups.global,
                                                                   params.engineInfo,
                                                                   newParams.inputs[0].GetLayout(),
                                                                   newParams.outputs[0].GetLayout(),
                                                                   dims_by_gws);

    kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT);
    kernel.params.arguments = GetArgsDesc(2, false, false);

    return {kd};
}

KernelsPriority GatherNonzeroKernelRef::GetKernelsPriority(const Params& /*params*/, const optional_params& /*options*/) const {
    return DONT_USE_IF_HAVE_SOMETHING_ELSE;
}

bool GatherNonzeroKernelRef::Validate(const Params& p, const optional_params& op) const {
    if (!KernelBaseOpenCL::Validate(p, op))
        return false;

    const auto& rp = static_cast<const gather_nonzero_params&>(p);

    return Tensor::SimpleLayout(rp.inputs[0].GetLayout());
}
}  // namespace kernel_selector
@@ -0,0 +1,36 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "kernel_base_opencl.h"

namespace kernel_selector {
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// gather_nonzero_params
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
struct gather_nonzero_params : public base_params {
    gather_nonzero_params() : base_params(KernelType::GATHER_NONZERO) {}
};

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// gather_nonzero_optional_params
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
struct gather_nonzero_optional_params : optional_params {
    gather_nonzero_optional_params() : optional_params(KernelType::GATHER_NONZERO) {}
};

class GatherNonzeroKernelRef : public KernelBaseOpenCL {
public:
    GatherNonzeroKernelRef() : KernelBaseOpenCL("gather_nonzero_ref") {}
    virtual ~GatherNonzeroKernelRef() {}

    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
    KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
    ParamsKey GetSupportedKey() const override;

protected:
    bool Validate(const Params& p, const optional_params& op) const override;
};
}  // namespace kernel_selector
@@ -0,0 +1,15 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "gather_nonzero_kernel_selector.h"
#include "gather_nonzero_kernel_ref.h"

namespace kernel_selector {

gather_nonzero_kernel_selector::gather_nonzero_kernel_selector() { Attach<GatherNonzeroKernelRef>(); }

KernelsData gather_nonzero_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
    return GetNaiveBestKernel(params, options, KernelType::GATHER_NONZERO);
}
}  // namespace kernel_selector
@@ -0,0 +1,23 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "kernel_selector.h"

namespace kernel_selector {
class gather_nonzero_kernel_selector : public kernel_selector_base {
public:
    static gather_nonzero_kernel_selector& Instance() {
        static gather_nonzero_kernel_selector instance_;
        return instance_;
    }

    gather_nonzero_kernel_selector();

    virtual ~gather_nonzero_kernel_selector() {}

    KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
};
}  // namespace kernel_selector
@@ -0,0 +1,51 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"

#define INPUT0_GET_INDEX1(idx_order) INPUT0_GET_INDEX(idx_order)

KERNEL (count_nonzero_ref)(const __global INPUT0_TYPE* input,
                           volatile __global OUTPUT_TYPE* output)
{
    const uint gdim0 = (uint)get_global_id(0);
    const uint gdim1 = (uint)get_global_id(1);
    const uint gdim2 = (uint)get_global_id(2);

#if INPUT0_DIMS == 6
    #define INPUT_ORDER b,f,w,z,y,x
    const uint x = gdim0 % INPUT0_SIZE_X;
    const uint y = gdim0 / INPUT0_SIZE_X;
    const uint z = gdim1 % INPUT0_SIZE_Z;
    const uint w = gdim1 / INPUT0_SIZE_Z;
#elif INPUT0_DIMS == 5
    #define INPUT_ORDER b,f,z,y,x
    const uint x = gdim0;
    const uint y = gdim1 % INPUT0_SIZE_Y;
    const uint z = gdim1 / INPUT0_SIZE_Y;
#elif INPUT0_DIMS == 4
    #define INPUT_ORDER b,f,y,x
    const uint x = gdim0;
    const uint y = gdim1;
#endif

    const uint f = gdim2 % INPUT0_FEATURE_NUM;
    const uint b = gdim2 / INPUT0_FEATURE_NUM;

    uint count = (input[INPUT0_GET_INDEX1(INPUT_ORDER)] == INPUT0_VAL_ZERO) ? 0 : 1;
    count = sub_group_reduce_add(count);

    if (get_sub_group_local_id() == 0)
        atomic_add(&(output[1]), count);

    if (gdim0 == 0 && gdim1 == 0 && gdim2 == 0) {
        output[0] = INPUT0_DIMS;
        output[2] = 1;
        output[3] = 1;
    }
}

#undef INPUT0_GET_INDEX1
#undef INPUT_ORDER
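Each work item tests one input element; partial counts are combined per sub-group with sub_group_reduce_add and added to output[1] with a single atomic per sub-group, while work item (0, 0, 0) fills in the remaining shape fields. A sequential host-side sketch of the value the kernel produces (illustration only; this helper is not part of the commit):

#include <cstddef>
#include <cstdint>

// Reference for the count_nonzero result buffer: [rank, count, 1, 1].
static void count_nonzero_reference(const float* input, std::size_t n, int rank, int32_t out[4]) {
    int32_t count = 0;
    for (std::size_t i = 0; i < n; ++i)
        count += (input[i] != 0.0f) ? 1 : 0;
    out[0] = rank;   // the kernel writes INPUT0_DIMS here
    out[1] = count;  // the kernel accumulates this via atomic_add
    out[2] = 1;
    out[3] = 1;
}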
@@ -0,0 +1,87 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"

#define INPUT0_GET_INDEX1(idx_order) INPUT0_GET_INDEX(idx_order)

KERNEL (gather_nonzero_ref)(const __global INPUT0_TYPE* input,
                            volatile __global INPUT1_TYPE* output_shape,
                            __global OUTPUT_TYPE* output)
{
    const uint gdim0 = (uint)get_global_id(0);
    const uint gdim1 = (uint)get_global_id(1);
    const uint gdim2 = (uint)get_global_id(2);

#if INPUT0_DIMS == 6
    #define INPUT_ORDER b,f,w,z,y,x
    const uint x = gdim0 % INPUT0_SIZE_X;
    const uint y = gdim0 / INPUT0_SIZE_X;
    const uint z = gdim1 % INPUT0_SIZE_Z;
    const uint w = gdim1 / INPUT0_SIZE_Z;
#elif INPUT0_DIMS == 5
    #define INPUT_ORDER b,f,z,y,x
    const uint x = gdim0;
    const uint y = gdim1 % INPUT0_SIZE_Y;
    const uint z = gdim1 / INPUT0_SIZE_Y;
#elif INPUT0_DIMS == 4
    #define INPUT_ORDER b,f,y,x
    const uint x = gdim0;
    const uint y = gdim1;
#endif

    const uint f = gdim2 % INPUT0_FEATURE_NUM;
    const uint b = gdim2 / INPUT0_FEATURE_NUM;

    int num_nonzero_acc = (input[INPUT0_GET_INDEX1(INPUT_ORDER)] == INPUT0_VAL_ZERO) ? 0 : 1;
    num_nonzero_acc = sub_group_scan_inclusive_add(num_nonzero_acc);

    int pos;

    if (get_sub_group_local_id() == (get_sub_group_size() - 1)) {
        pos = atomic_add(&(output_shape[2]), num_nonzero_acc);
        pos = pos - 1;
    }

    pos = sub_group_broadcast(pos, (get_sub_group_size() - 1));

    // output_shape = {rank, # nonzero, 1, 1}
    if (input[INPUT0_GET_INDEX1(INPUT_ORDER)] != INPUT0_VAL_ZERO) {
        const int num_nonzero = output_shape[1];

        pos = pos + num_nonzero_acc - 1;

        int output_b = pos;
        int output_f = pos + num_nonzero;

        output[output_b] = b;
        output[output_f] = f;

#if INPUT0_DIMS == 6
        int output_w = pos + num_nonzero * 2;
        int output_z = pos + num_nonzero * 3;
        int output_y = pos + num_nonzero * 4;
        int output_x = pos + num_nonzero * 5;

        output[output_w] = w;
        output[output_z] = z;
#elif INPUT0_DIMS == 5
        int output_z = pos + num_nonzero * 2;
        int output_y = pos + num_nonzero * 3;
        int output_x = pos + num_nonzero * 4;

        output[output_z] = z;
#elif INPUT0_DIMS == 4
        int output_y = pos + num_nonzero * 2;
        int output_x = pos + num_nonzero * 3;
#endif

        output[output_y] = y;
        output[output_x] = x;
    }
}

#undef INPUT0_GET_INDEX1
#undef INPUT_ORDER
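The gathered indices are packed column-wise: the coordinate along axis k of the j-th claimed nonzero element lands at output[j + num_nonzero * k], with slots claimed through the atomic counter in output_shape[2], so the column order is not deterministic (the tests below therefore only check that every expected column appears somewhere). A sequential sketch of the same packing for the 4-D bfyx case (illustration only, not part of this commit):

#include <cstdint>

// Writes the [b; f; y; x] column for every nonzero element of a dense bfyx buffer.
static void gather_nonzero_reference(const float* input, int B, int F, int Y, int X,
                                     int num_nonzero, int32_t* output) {
    int j = 0;
    for (int b = 0; b < B; ++b)
        for (int f = 0; f < F; ++f)
            for (int y = 0; y < Y; ++y)
                for (int x = 0; x < X; ++x) {
                    if (input[((b * F + f) * Y + y) * X + x] == 0.0f)
                        continue;
                    output[j + num_nonzero * 0] = b;
                    output[j + num_nonzero * 1] = f;
                    output[j + num_nonzero * 2] = y;
                    output[j + num_nonzero * 3] = x;
                    ++j;
                }
}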
src/plugins/intel_gpu/src/plugin/ops/non_zero.cpp (new file, 38 lines)
@@ -0,0 +1,38 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "intel_gpu/plugin/program.hpp"
#include "intel_gpu/plugin/common_utils.hpp"

#include "ngraph/op/non_zero.hpp"

#include "intel_gpu/primitives/non_zero.hpp"

namespace ov {
namespace intel_gpu {

static void CreateNonZeroOp(Program& p, const std::shared_ptr<ngraph::Node>& op) {
    p.ValidateInputs(op, {1});
    auto input_primitives = p.GetInputPrimitiveIDs(op);
    std::string layer_name = layer_type_name_ID(op);

    cldnn::primitive_id count_prim_id = layer_name + "_count";
    auto count_prim = cldnn::count_nonzero(count_prim_id,
                                           input_primitives[0],
                                           op->get_friendly_name());

    auto gather_prim = cldnn::gather_nonzero(layer_name,
                                             input_primitives[0],
                                             count_prim_id,
                                             op->get_friendly_name());

    p.AddPrimitive(count_prim);
    p.AddPrimitive(gather_prim);
    p.AddPrimitiveToProfiler(op);
}

REGISTER_FACTORY_IMPL(v3, NonZero);

}  // namespace intel_gpu
}  // namespace ov
src/plugins/intel_gpu/tests/test_cases/non_zero_gpu_test.cpp (new file, 342 lines)
@@ -0,0 +1,342 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "test_utils.h"

#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/non_zero.hpp>
#include <intel_gpu/runtime/memory.hpp>
#include <intel_gpu/graph/topology.hpp>
#include <intel_gpu/graph/network.hpp>

#include <cstddef>

using namespace cldnn;
using namespace ::tests;

inline void do_count_non_zero_test(engine& engine,
                                   const cldnn::memory::ptr& input_data,
                                   const std::vector<int32_t>& expected_results)
{
    topology topology;
    topology.add(input_layout("InputData", input_data->get_layout()));
    topology.add(
        count_nonzero("count_nonzero", "InputData")
    );

    network network(engine, topology);

    network.set_input_data("InputData", input_data);
    auto outputs = network.execute();
    auto output = outputs.at("count_nonzero").get_memory();
    cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream());

    for (size_t i = 0; i < expected_results.size(); ++i) {
        EXPECT_EQ(expected_results[i], output_ptr[i]);
    }
}

inline void do_gather_non_zero_test(engine& engine,
                                    const cldnn::memory::ptr& input_data,
                                    const cldnn::memory::ptr& output_shape,
                                    const std::vector<int32_t>& expected_results)
{
    topology topology;
    topology.add(input_layout("InputData", input_data->get_layout()));
    topology.add(input_layout("OutputShape", output_shape->get_layout()));
    topology.add(
        gather_nonzero("gather_nonzero", "InputData", "OutputShape")
    );

    network network(engine, topology);

    network.set_input_data("InputData", input_data);
    network.set_input_data("OutputShape", output_shape);
    auto outputs = network.execute();
    auto output = outputs.at("gather_nonzero").get_memory();
    cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream());
    cldnn::mem_lock<int32_t> shape_ptr(output_shape, get_test_stream());

    int num_ranks = shape_ptr[0];
    int num_nonzero = shape_ptr[1];

    for (int i = 0; i < num_nonzero; i++) {
        bool found = false;
        for (int j = 0; j < num_nonzero; j++) {
            for (int k = 0; k < num_ranks; k++) {
                if (output_ptr[i+num_nonzero*k] != expected_results[j+num_nonzero*k])
                    break;

                if (k == (num_ranks - 1)) {
                    found = true;
                }
            }
            if (found)
                break;
        }

        EXPECT_TRUE(found);
    }
}

inline void do_non_zero_test(engine& engine,
                             const cldnn::memory::ptr& input_data,
                             const std::vector<int32_t>& expected_shape,
                             const std::vector<int32_t>& expected_results)
{
    topology topology;
    topology.add(input_layout("InputData", input_data->get_layout()));
    topology.add(
        count_nonzero("count_nonzero", "InputData")
    );
    topology.add(
        gather_nonzero("gather_nonzero", "InputData", "count_nonzero")
    );

    network network(engine, topology);

    network.set_input_data("InputData", input_data);
    auto outputs = network.execute();
    auto output = outputs.at("gather_nonzero").get_memory();
    cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream());
    std::vector<int32_t> output_list = std::vector<int32_t>(output_ptr.begin(), output_ptr.end());

    int num_ranks = expected_shape[0];
    int num_nonzero = expected_shape[1];

    EXPECT_EQ(num_ranks*num_nonzero, output_list.size());

    for (int i = 0; i < num_nonzero; i++) {
        bool found = false;
        for (int j = 0; j < num_nonzero; j++) {
            for (int k = 0; k < num_ranks; k++) {
                if (output_list[i+num_nonzero*k] != expected_results[j+num_nonzero*k])
                    break;

                if (k == (num_ranks - 1)) {
                    found = true;
                }
            }
            if (found)
                break;
        }

        EXPECT_TRUE(found);
    }
}

TEST(count_nonzero_gpu_fp16, test1) {
    auto& engine = get_test_engine();

    auto input = engine.allocate_memory({ data_types::f16, format::bfyx, tensor{ 1, 3, 3, 1 } });

    set_values(input, {
        FLOAT16(0), FLOAT16(1), FLOAT16(8),
        FLOAT16(5), FLOAT16(5), FLOAT16(2),
        FLOAT16(7), FLOAT16(10), FLOAT16(4),
    });

    std::vector<int32_t> expected_results = {
        4, 8, 1, 1,
    };

    do_count_non_zero_test(engine, input, expected_results);
}

TEST(gather_nonzero_gpu_fp16, test1) {
    auto& engine = get_test_engine();

    auto input = engine.allocate_memory({ data_types::f16, format::bfyx, tensor{ 1, 3, 3, 1 } });
    auto output_shape = engine.allocate_memory({ data_types::i32, format::bfyx, tensor{ 1, 1, 4, 1 } });

    set_values(input, {
        FLOAT16(0), FLOAT16(1), FLOAT16(8),
        FLOAT16(5), FLOAT16(5), FLOAT16(2),
        FLOAT16(7), FLOAT16(10), FLOAT16(4),
    });

    set_values(output_shape, {
        4, 8, 1, 1,
    });

    std::vector<int32_t> expected_results = {
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 2, 2, 2,
        0, 0, 0, 0, 0, 0, 0, 0,
        1, 2, 0, 1, 2, 0, 1, 2,
    };

    do_gather_non_zero_test(engine, input, output_shape, expected_results);
}

TEST(gather_nonzero_gpu_fp16, test2) {
    auto& engine = get_test_engine();

    auto input = engine.allocate_memory({ data_types::f16, format::bfyx, tensor{ 1, 3, 3, 1 } });
    auto output_shape = engine.allocate_memory({ data_types::i32, format::bfyx, tensor{ 1, 1, 4, 1 } });

    set_values(input, {
        FLOAT16(0), FLOAT16(1), FLOAT16(8),
        FLOAT16(5), FLOAT16(5), FLOAT16(0),
        FLOAT16(7), FLOAT16(0), FLOAT16(4),
    });

    set_values(output_shape, {
        4, 6, 1, 1,
    });

    std::vector<int32_t> expected_results = {
        0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 2, 2,
        0, 0, 0, 0, 0, 0,
        1, 2, 0, 1, 0, 2,
    };

    do_gather_non_zero_test(engine, input, output_shape, expected_results);
}

TEST(nonzero_gpu_fp16, test1) {
    auto& engine = get_test_engine();

    auto input = engine.allocate_memory({ data_types::f16, format::bfyx, tensor{ 1, 3, 3, 1 } });

    set_values(input, {
        FLOAT16(0), FLOAT16(1), FLOAT16(8),
        FLOAT16(5), FLOAT16(5), FLOAT16(0),
        FLOAT16(7), FLOAT16(0), FLOAT16(4),
    });

    std::vector<int32_t> expected_shape = {
        4, 6, 1, 1,
    };

    std::vector<int32_t> expected_results = {
        0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 2, 2,
        0, 0, 0, 0, 0, 0,
        1, 2, 0, 1, 0, 2,
    };

    do_non_zero_test(engine, input, expected_shape, expected_results);
}

TEST(nonzero_gpu_fp16, test2) {
    auto& engine = get_test_engine();

    auto input = engine.allocate_memory({ data_types::f16, format::bfzyx, tensor{ 1, 3, 3, 3, 1 } });

    set_values(input, {
        FLOAT16(0), FLOAT16(1), FLOAT16(8),
        FLOAT16(7), FLOAT16(10), FLOAT16(4),
        FLOAT16(7), FLOAT16(0), FLOAT16(4),
        FLOAT16(9), FLOAT16(5), FLOAT16(1),
        FLOAT16(2), FLOAT16(0), FLOAT16(8),
        FLOAT16(2), FLOAT16(10), FLOAT16(7),
        FLOAT16(2), FLOAT16(4), FLOAT16(8),
        FLOAT16(5), FLOAT16(9), FLOAT16(10),
        FLOAT16(10), FLOAT16(5), FLOAT16(2),
    });

    std::vector<int32_t> expected_shape = {
        5, 24, 1, 1,
    };

    std::vector<int32_t> expected_results = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 2, 2, 0, 0, 0, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2,
        1, 2, 0, 1, 2, 0, 2, 0, 1, 2, 0, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2,
    };

    do_non_zero_test(engine, input, expected_shape, expected_results);
}

TEST(nonzero_gpu_fp16, test3) {
    auto& engine = get_test_engine();

    auto input = engine.allocate_memory({ data_types::f16, format::bfwzyx, tensor{ 1, 3, 3, 3, 3, 1 } });

    set_values(input, {
FLOAT16(0), FLOAT16(1), FLOAT16(8), FLOAT16(7), FLOAT16(10), FLOAT16(4), FLOAT16(6), FLOAT16(5), FLOAT16(4),
FLOAT16(7), FLOAT16(0), FLOAT16(4), FLOAT16(9), FLOAT16(5), FLOAT16(1), FLOAT16(2), FLOAT16(2), FLOAT16(0),
FLOAT16(2), FLOAT16(0), FLOAT16(8), FLOAT16(2), FLOAT16(10), FLOAT16(7), FLOAT16(7), FLOAT16(0), FLOAT16(6),
FLOAT16(2), FLOAT16(4), FLOAT16(8), FLOAT16(5), FLOAT16(9), FLOAT16(10), FLOAT16(10), FLOAT16(5), FLOAT16(2),
FLOAT16(4), FLOAT16(8), FLOAT16(2), FLOAT16(1), FLOAT16(4), FLOAT16(10), FLOAT16(10), FLOAT16(2), FLOAT16(21),
FLOAT16(0), FLOAT16(1), FLOAT16(5), FLOAT16(1), FLOAT16(5), FLOAT16(1), FLOAT16(9), FLOAT16(4), FLOAT16(22),
FLOAT16(4), FLOAT16(3), FLOAT16(7), FLOAT16(6), FLOAT16(9), FLOAT16(8), FLOAT16(9), FLOAT16(7), FLOAT16(23),
FLOAT16(4), FLOAT16(10), FLOAT16(6), FLOAT16(3), FLOAT16(5), FLOAT16(5), FLOAT16(4), FLOAT16(2), FLOAT16(23),
FLOAT16(0), FLOAT16(4), FLOAT16(5), FLOAT16(3), FLOAT16(1), FLOAT16(2), FLOAT16(8), FLOAT16(5), FLOAT16(0),
    });

    std::vector<int32_t> expected_shape = {
        6, 73, 1, 1,
    };

    std::vector<int32_t> expected_results = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 1, 1, 1, 2, 2, 0, 0, 1, 1, 1, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 1, 1, 1, 2, 2,
1, 2, 0, 1, 2, 0, 1, 2, 0, 2, 0, 1, 2, 0, 1, 0, 2, 0, 1, 2, 0, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 1, 2, 0, 1, 2, 0, 1,
    };

    do_non_zero_test(engine, input, expected_shape, expected_results);
}

TEST(nonzero_gpu_fp32, test1) {
    auto& engine = get_test_engine();

    auto input = engine.allocate_memory({ data_types::f32, format::bfyx, tensor{ 2, 3, 5, 4 } });

    set_values(input, {
6, 6, 0, 3, 0, 4, 1, 0, 8, 4, 8, 5, 8, 6, 0, 2, 0, 9, 6, 9,
1, 2, 4, 9, 0, 8, 5, 7, 4, 6, 8, 0, 6, 2, 3, 5, 0, 9, 8, 7,
3, 6, 5, 3, 8, 4, 7, 5, 7, 8, 5, 2, 1, 8, 9, 2, 1, 4, 3, 3,
7, 3, 9, 9, 0, 2, 4, 0, 4, 9, 5, 9, 4, 5, 8, 1, 2, 9, 7, 6,
7, 9, 6, 7, 2, 9, 2, 7, 8, 3, 1, 2, 7, 4, 6, 2, 3, 7, 0, 5,
2, 3, 7, 7, 0, 3, 4, 0, 9, 0, 9, 0, 2, 7, 7, 8, 6, 6, 0, 8,
    });

    std::vector<int32_t> expected_shape = {
        4, 104, 1, 1,
    };

    std::vector<int32_t> expected_results = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3,
0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3,
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3,
0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
0, 1, 3, 0, 1, 3, 4, 0, 1, 2, 3, 0, 2, 3, 4,
0, 1, 2, 3, 0, 1, 2, 3, 4, 0, 2, 3, 4, 0, 2, 3, 4,
0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4,
0, 1, 2, 3, 0, 1, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4,
0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 4,
0, 1, 2, 3, 0, 1, 3, 0, 2, 3, 4, 0, 1, 2, 4,
    };

    do_non_zero_test(engine, input, expected_shape, expected_results);
}