[GPU] Add new operation GatherND-8 (#8586)

Signed-off-by: Kelvin Choi <kelvin.choi@intel.com>
This commit is contained in:
Kelvin Choi
2021-11-18 13:24:02 +09:00
committed by GitHub
parent 1dbb8910e3
commit 517cf8dff6
14 changed files with 297 additions and 141 deletions

View File

@@ -211,6 +211,6 @@ REGISTER_FACTORY(v7, Gather);
// ------------------------------ Supported v8 ops ------------------------------ //
REGISTER_FACTORY(v8, Gather);
REGISTER_FACTORY(v8, GatherND);
// --------------------------- Supported internal ops --------------------------- //
REGISTER_FACTORY(internal, NonMaxSuppressionIEInternal);

View File

@@ -26,6 +26,7 @@ static void CreateGatherNDOp(Program& p, const std::shared_ptr<ngraph::op::v5::G
inputPrimitives[1],
indices_rank,
batch_dims,
true,
op->get_friendly_name());
p.AddPrimitive(primitive);
@@ -34,4 +35,27 @@ static void CreateGatherNDOp(Program& p, const std::shared_ptr<ngraph::op::v5::G
REGISTER_FACTORY_IMPL(v5, GatherND);
// Converts an ngraph v8::GatherND operation into a cldnn gather_nd primitive.
// The only difference from the v5 path is the batch_merged_output flag:
// it is passed as false here, so the batched output dimensions are kept
// separate instead of being merged into a single dimension.
static void CreateGatherNDOp(Program& p, const std::shared_ptr<ngraph::op::v8::GatherND>& op) {
p.ValidateInputs(op, { 2 });  // GatherND takes exactly two inputs: data and indices
auto inputPrimitives = p.GetInputPrimitiveIDs(op);
std::string layerName = layer_type_name_ID(op);
// Rank of the indices tensor (input 1) is required by the kernel.
int32_t indices_rank = static_cast<int32_t>(op->get_input_shape(1).size());
auto batch_dims = op->get_batch_dims();
auto primitive = cldnn::gather_nd(layerName,
inputPrimitives[0],
inputPrimitives[1],
indices_rank,
batch_dims,
false,  // batch_merged_output: false for v8
op->get_friendly_name());
p.AddPrimitive(primitive);
p.AddPrimitiveToProfiler(op);
}
REGISTER_FACTORY_IMPL(v8, GatherND);
} // namespace CLDNNPlugin

View File

@@ -3,7 +3,6 @@
//
#include <vector>
#include <ngraph/opsets/opset5.hpp>
#include "single_layer_tests/gather_nd.hpp"
#include "common_test_utils/test_constants.hpp"
@@ -33,15 +32,6 @@ const auto gatherNDArgsSubset1 = ::testing::Combine(
::testing::ValuesIn(std::vector<int>({ 0, 1 })) // Batch dims
);
INSTANTIATE_TEST_SUITE_P(smoke_GatherND_set1, GatherNDLayerTest,
::testing::Combine(
gatherNDArgsSubset1,
::testing::ValuesIn(inputPrecisions),
::testing::ValuesIn(idxPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GPU),
::testing::Values<Config>({})),
GatherNDLayerTest::getTestCaseName);
// set2
const auto gatherNDArgsSubset2 = ::testing::Combine(
::testing::ValuesIn(std::vector<std::vector<size_t>>(
@@ -51,15 +41,6 @@ const auto gatherNDArgsSubset2 = ::testing::Combine(
::testing::ValuesIn(std::vector<int>({ 1, 2 })) // Batch dims
);
INSTANTIATE_TEST_SUITE_P(smoke_GatherND_set2, GatherNDLayerTest,
::testing::Combine(
gatherNDArgsSubset2,
::testing::ValuesIn(inputPrecisions),
::testing::ValuesIn(idxPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GPU),
::testing::Values<Config>({})),
GatherNDLayerTest::getTestCaseName);
// set3
const auto gatherNDArgsSubset3 = ::testing::Combine(
::testing::ValuesIn(std::vector<std::vector<size_t>>(
@@ -69,7 +50,27 @@ const auto gatherNDArgsSubset3 = ::testing::Combine(
::testing::ValuesIn(std::vector<int>({ 3, 4 })) // Batch dims
);
INSTANTIATE_TEST_SUITE_P(smoke_GatherND_set3, GatherNDLayerTest,
// -------------------------------- V5 --------------------------------
INSTANTIATE_TEST_SUITE_P(smoke_GatherND5_set1, GatherNDLayerTest,
::testing::Combine(
gatherNDArgsSubset1,
::testing::ValuesIn(inputPrecisions),
::testing::ValuesIn(idxPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GPU),
::testing::Values<Config>({})),
GatherNDLayerTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_GatherND5_set2, GatherNDLayerTest,
::testing::Combine(
gatherNDArgsSubset2,
::testing::ValuesIn(inputPrecisions),
::testing::ValuesIn(idxPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GPU),
::testing::Values<Config>({})),
GatherNDLayerTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_GatherND5_set3, GatherNDLayerTest,
::testing::Combine(
gatherNDArgsSubset3,
::testing::ValuesIn(inputPrecisions),
@@ -78,4 +79,32 @@ INSTANTIATE_TEST_SUITE_P(smoke_GatherND_set3, GatherNDLayerTest,
::testing::Values<Config>({})),
GatherNDLayerTest::getTestCaseName);
// -------------------------------- V8 --------------------------------
INSTANTIATE_TEST_SUITE_P(smoke_GatherND8_set1, GatherND8LayerTest,
::testing::Combine(
gatherNDArgsSubset1,
::testing::ValuesIn(inputPrecisions),
::testing::ValuesIn(idxPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GPU),
::testing::Values<Config>({})),
GatherND8LayerTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_GatherND8_set2, GatherND8LayerTest,
::testing::Combine(
gatherNDArgsSubset2,
::testing::ValuesIn(inputPrecisions),
::testing::ValuesIn(idxPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GPU),
::testing::Values<Config>({})),
GatherND8LayerTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_GatherND8_set3, GatherND8LayerTest,
::testing::Combine(
gatherNDArgsSubset3,
::testing::ValuesIn(inputPrecisions),
::testing::ValuesIn(idxPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GPU),
::testing::Values<Config>({})),
GatherND8LayerTest::getTestCaseName);
} // namespace

View File

@@ -9,6 +9,8 @@
#include <vector>
#include "shared_test_classes/base/layer_test_utils.hpp"
#include "ngraph_functions/builders.hpp"
#include "ngraph_functions/utils/ngraph_helpers.hpp"
namespace LayerTestsDefinitions {
using Config = std::map<std::string, std::string>;

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "ngraph_functions/builders.hpp"
#include "shared_test_classes/single_layer/gather_nd.hpp"
namespace LayerTestsDefinitions {
@@ -55,6 +54,7 @@ void GatherNDLayerTest::SetUp() {
function = std::make_shared<ngraph::Function>(results, params, "gatherND");
}
// v8 test cases are parameterized identically to v5, so the v5 name
// formatting is reused verbatim.
std::string GatherND8LayerTest::getTestCaseName(const testing::TestParamInfo<GatherNDParams>& obj) {
return GatherNDLayerTest::getTestCaseName(obj);
}
@@ -70,13 +70,13 @@ void GatherND8LayerTest::SetUp() {
auto ngDPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(dPrecision);
auto ngIPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(iPrecision);
auto params = ngraph::builder::makeParams(ngDPrc, {dataShape});
auto params = ngraph::builder::makeParams(ngDPrc, { dataShape });
auto paramOuts = ngraph::helpers::convert2OutputVector(
ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));
ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));
auto dataNode = paramOuts[0];
auto gather = std::dynamic_pointer_cast<ngraph::opset8::GatherND>(
ngraph::builder::makeGatherND(dataNode, indicesShape, ngIPrc, batchDims));
ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(gather)};
ngraph::builder::makeGatherND8(dataNode, indicesShape, ngIPrc, batchDims));
ngraph::ResultVector results{ std::make_shared<ngraph::opset1::Result>(gather) };
function = std::make_shared<ngraph::Function>(results, params, "gatherND");
}

View File

@@ -531,6 +531,12 @@ std::shared_ptr<ngraph::Node> makeGatherND(
const element::Type& indicesType,
const std::size_t batchDims);
// Builds an opset8 GatherND node over dataNode, generating a constant
// indices tensor of the given shape/type whose values are clamped into
// the valid range of the data dimensions they address.
std::shared_ptr<ngraph::Node> makeGatherND8(
const ngraph::Output<Node>& dataNode,
const ngraph::Shape& indicesShape,
const element::Type& indicesType,
const std::size_t batchDims);
std::shared_ptr<ngraph::Node> makeTile(const ngraph::Output<Node>& in,
const std::vector<int64_t>& repeats);

View File

@@ -41,5 +41,34 @@ std::shared_ptr<Node> makeGatherND(
return gatherNdNode;
}
// Builds an opset8 GatherND node over dataNode with a randomly generated
// constant indices tensor of shape indicesShape.  Every generated index is
// reduced modulo the size of the data dimension it addresses, so the
// resulting indices are always valid for dataNode's shape.
//
// @param dataNode     Data input of the GatherND node.
// @param indicesShape Shape of the generated indices constant; the last
//                     dimension is the slice rank (number of data dims
//                     each index tuple addresses, after the batch dims).
// @param indicesType  Element type of the indices constant.
// @param batchDims    Number of leading batch dimensions shared between
//                     data and indices.
// @return The constructed opset8::GatherND node.
std::shared_ptr<Node> makeGatherND8(
    const ngraph::Output<Node>& dataNode,
    const ngraph::Shape& indicesShape,
    const element::Type& indicesType,
    const std::size_t batchDims) {
    const auto indices = [&] {
        const auto& dataShape = dataNode.get_shape();
        // Number of index tuples: product of all indices dims except the last.
        const auto indicesCount = std::accumulate(begin(indicesShape), prev(end(indicesShape)),
                                                  1ull, std::multiplies<std::size_t>{});
        const auto sliceRank = indicesShape.back();

        const auto maxDim = *std::max_element(begin(dataShape), end(dataShape));

        auto indicesValues = NGraphFunctions::Utils::generateVector<element::Type_t::i32>(indicesCount * sliceRank, maxDim, 0);
        auto indicesData = indicesValues.data();
        // Use size_t counters: indicesCount/sliceRank are unsigned, and int
        // counters would trigger sign-compare warnings (and truncate for
        // very large counts).
        for (std::size_t i = 0; i < indicesCount; i++) {
            for (std::size_t dim = 0; dim < sliceRank; dim++) {
                // Clamp the generated value into the valid range of the data
                // dimension this position addresses (batch dims are skipped).
                *indicesData = *indicesData % dataShape[dim + batchDims];
                indicesData++;
            }
        }
        return opset8::Constant::create(indicesType, indicesShape, indicesValues);
    }();

    auto gatherNdNode = std::make_shared<opset8::GatherND>(dataNode, indices, batchDims);
    gatherNdNode->set_friendly_name("GatherND");

    return gatherNdNode;
}
} // namespace builder
} // namespace ngraph

View File

@@ -19,25 +19,37 @@ struct gather_nd : public primitive_base<gather_nd> {
CLDNN_DECLARE_PRIMITIVE(gather_nd)
/// @brief Constructs gather_nd primitive.
/// @param id This primitive id.
/// @param data Input data primitive id.
/// @param indices Input indexes primitive id.
/// @param indices_rank Rank of indices.
/// @param batch_dims batch_dims as an attribute of GatherND. Optional.
///
/// @param id This primitive id.
/// @param data Input data primitive id.
/// @param indices Input indexes primitive id.
/// @param indices_rank Rank of indices.
/// @param batch_dims batch_dims as an attribute of GatherND. Optional.
/// @param batch_merged_output batched output shape is merged as a dimension for v5.
/// In case of output{3, 2, 4, 5} at batch_dims = 2, real output shape should be {6, 4, 5}.
/// This should be false for v8.
/// For batch_dims < 2, this doesn't have any meaning.
gather_nd(const primitive_id& id,
const primitive_id& data,
const primitive_id& indices,
const uint8_t indices_rank,
const uint8_t batch_dims = 0,
const bool batch_merged_output = true,
const primitive_id& ext_prim_id = "",
const padding& output_padding = padding())
: primitive_base(id, {data, indices}, ext_prim_id, output_padding), indices_rank(indices_rank), batch_dims(batch_dims) {}
: primitive_base(id, {data, indices}, ext_prim_id, output_padding),
indices_rank(indices_rank),
batch_dims(batch_dims),
batch_merged_output(batch_merged_output) {}
/// @brief GatherND indices_rank
uint8_t indices_rank;
/// @brief GatherND batch_dims
uint8_t batch_dims;
/// @brief GatherND batch_merged_output
bool batch_merged_output;
};
/// @}
/// @}

View File

@@ -117,6 +117,7 @@ JitConstants GatherNDKernelRef::GetJitConstants(const gather_nd_params& params)
jit.AddConstant(MakeJitConstant("INDICES_RANK", params.indices_rank));
jit.AddConstant(MakeJitConstant("BATCH_DIMS", params.batch_dims));
jit.AddConstant(MakeJitConstant("BATCH_MERGED_OUTPUT", params.batch_merged_output));
jit.AddConstant(MakeJitConstant("WI_SLICE_SIZE", GetSliceSize(params)));
jit.AddConstant(MakeJitConstant("INDICES_LAST_DIM", GetIndicesLastDim(params)));

View File

@@ -11,11 +11,13 @@ namespace kernel_selector {
// gather_nd_params
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
struct gather_nd_params : public base_params {
gather_nd_params() : base_params(KernelType::GATHER_ND), indices_rank(0), batch_dims(0) {}
gather_nd_params() : base_params(KernelType::GATHER_ND), indices_rank(0), batch_dims(0), batch_merged_output(true) {}
uint8_t indices_rank;
uint8_t batch_dims;
bool batch_merged_output;
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

View File

@@ -125,28 +125,47 @@ KERNEL(gather_nd_ref)(const __global INPUT0_TYPE* data,
const uint out_f = idx_f;
const uint out_b = idx_b;
#else
uint pitch_acc = 1;
uint output_batch_size = 0;
for (int i = BATCH_DIMS - 1; i >= 0; i--) {
output_batch_size += (idx_arr[i] * pitch_acc);
pitch_acc *= idx_dim[i];
}
#if BATCH_MERGED_OUTPUT
uint pitch_acc = 1;
uint output_batch_size = 0;
for (int i = BATCH_DIMS - 1; i >= 0; i--) {
output_batch_size += (idx_arr[i] * pitch_acc);
pitch_acc *= idx_dim[i];
}
#if OUTPUT_DIMS == 4
const uint out_x = idx_arr[BATCH_DIMS+2];
const uint out_y = idx_arr[BATCH_DIMS+1];
#elif OUTPUT_DIMS == 5
const uint out_x = idx_arr[BATCH_DIMS+3];
const uint out_y = idx_arr[BATCH_DIMS+2];
const uint out_z = idx_arr[BATCH_DIMS+1];
#if OUTPUT_DIMS == 4
const uint out_x = idx_arr[BATCH_DIMS+2];
const uint out_y = idx_arr[BATCH_DIMS+1];
#elif OUTPUT_DIMS == 5
const uint out_x = idx_arr[BATCH_DIMS+3];
const uint out_y = idx_arr[BATCH_DIMS+2];
const uint out_z = idx_arr[BATCH_DIMS+1];
#else
const uint out_x = idx_arr[BATCH_DIMS+4];
const uint out_y = idx_arr[BATCH_DIMS+3];
const uint out_z = idx_arr[BATCH_DIMS+2];
const uint out_w = idx_arr[BATCH_DIMS+1];
#endif
const uint out_f = idx_arr[BATCH_DIMS+0];
const uint out_b = output_batch_size;
#else
const uint out_x = idx_arr[BATCH_DIMS+4];
const uint out_y = idx_arr[BATCH_DIMS+3];
const uint out_z = idx_arr[BATCH_DIMS+2];
const uint out_w = idx_arr[BATCH_DIMS+1];
#if OUTPUT_DIMS == 4
const uint out_x = idx_arr[3];
const uint out_y = idx_arr[2];
#elif OUTPUT_DIMS == 5
const uint out_x = idx_arr[4];
const uint out_y = idx_arr[3];
const uint out_z = idx_arr[2];
#else
const uint out_x = idx_arr[5];
const uint out_y = idx_arr[4];
const uint out_z = idx_arr[3];
const uint out_w = idx_arr[2];
#endif
const uint out_f = idx_arr[1];
const uint out_b = idx_arr[0];
#endif
const uint out_f = idx_arr[BATCH_DIMS+0];
const uint out_b = output_batch_size;
#endif
const uint output_idx = GET_OUTPUT_INDEX(OUT_ORDER);

View File

@@ -41,34 +41,41 @@ layout gather_nd_inst::calc_output_layout(gather_nd_node const& node) {
output_sizes.push_back(input_layout[x]);
}
// calculate batch_size by batch_dims
int batch_size = 1;
for (uint8_t x = 0; x < batch_dims; x++) {
batch_size *= output_sizes[x];
}
// create final output shape by batch_dims
std::vector<tensor::value_type> final_output_sizes;
if (batch_dims > 0) {
final_output_sizes.push_back(batch_size);
if (op->batch_merged_output) {
// calculate batch_size by batch_dims
int batch_size = 1;
for (uint8_t x = 0; x < batch_dims; x++) {
batch_size *= output_sizes[x];
}
if (batch_dims > 0) {
final_output_sizes.push_back(batch_size);
}
for (size_t x = static_cast<size_t>(batch_dims); x < output_sizes.size(); x++) {
final_output_sizes.push_back(output_sizes[x]);
}
} else {
for (size_t x = 0; x < output_sizes.size(); x++) {
final_output_sizes.push_back(output_sizes[x]);
}
}
for (size_t x = static_cast<size_t>(batch_dims); x < output_sizes.size(); x++) {
final_output_sizes.push_back(output_sizes[x]);
}
auto output_format = cldnn::format::bfyx;
if (final_output_sizes.size() >= 6) {
output_format = cldnn::format::bfwzyx;
auto output_format = cldnn::format::any;
if (final_output_sizes.size() <= 4) {
output_format = cldnn::format::bfyx;
} else if (final_output_sizes.size() == 5) {
output_format = cldnn::format::bfzyx;
} else {
output_format = cldnn::format::bfwzyx;
}
auto output_sizes_tensor = tensor(tensor(final_output_sizes).sizes(output_format));
auto padding = op->output_padding;
if (node.has_fused_primitives()) {
input_layout_origin.data_type = node.get_fused_output_layout().data_type;
}

View File

@@ -29,6 +29,7 @@ struct gather_nd_impl : typed_primitive_impl_ocl<gather_nd> {
gather_nd_params.indices_rank = arg.get_primitive()->indices_rank;
gather_nd_params.batch_dims = arg.get_primitive()->batch_dims;
gather_nd_params.batch_merged_output = arg.get_primitive()->batch_merged_output;
gather_nd_params.inputs.push_back(convert_data_tensor(arg.input(1).get_output_layout()));

View File

@@ -11,18 +11,20 @@
using namespace cldnn;
using namespace ::tests;
inline void DoTest(engine& engine,
inline void DoTestBase(engine& engine,
const cldnn::memory::ptr input0,
const cldnn::memory::ptr input1,
const std::vector<float>& expected_results,
const int indices_rank,
const int batch_dims) {
const int batch_dims,
const cldnn::format fmt,
const tensor ts,
const bool batch_merged_output) {
topology topology;
auto gather_nd_inst = gather_nd("gather_nd", "InputData", "InputIndices", indices_rank, batch_dims, batch_merged_output);
topology.add(input_layout("InputData", input0->get_layout()));
topology.add(input_layout("InputIndices", input1->get_layout()));
topology.add(
gather_nd("gather_nd", "InputData", "InputIndices", indices_rank, batch_dims)
);
topology.add(gather_nd_inst);
network network(engine, topology);
@@ -30,13 +32,54 @@ inline void DoTest(engine& engine,
network.set_input_data("InputIndices", input1);
auto outputs = network.execute();
auto output = outputs.at("gather_nd").get_memory();
cldnn::mem_lock<uint16_t> output_ptr(output, get_test_stream());
// Compare output shape
auto output_format = output->get_layout().format;
auto output_shape = output->get_layout().size;
EXPECT_EQ(fmt, output_format);
int32_t dim_size = 6;
if (fmt == format::bfyx) {
dim_size = 4;
} else if (fmt == format::bfzyx) {
dim_size = 5;
}
for (int32_t i = 0; i < dim_size; i++)
{
EXPECT_EQ(ts.sizes()[i], output_shape.sizes()[i]);
}
// Compare output value
cldnn::mem_lock<uint16_t> output_ptr(output, get_test_stream());
for (size_t i = 0; i < expected_results.size(); ++i) {
EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i]));
}
}
inline void DoTestV5(engine& engine,
const cldnn::memory::ptr input0,
const cldnn::memory::ptr input1,
const std::vector<float>& expected_results,
const int indices_rank,
const int batch_dims,
const cldnn::format fmt,
const tensor size) {
DoTestBase(engine, input0, input1, expected_results, indices_rank, batch_dims, fmt, size, true);
}
inline void DoTestV8(engine& engine,
const cldnn::memory::ptr input0,
const cldnn::memory::ptr input1,
const std::vector<float>& expected_results,
const int indices_rank,
const int batch_dims,
const cldnn::format fmt,
const tensor size) {
DoTestBase(engine, input0, input1, expected_results, indices_rank, batch_dims, fmt, size, false);
}
TEST(gather_nd_gpu_fp16, d23322_i231312_ir6_batch2) {
auto& engine = get_test_engine();
@@ -44,7 +87,7 @@ TEST(gather_nd_gpu_fp16, d23322_i231312_ir6_batch2) {
const int batch_dims = 2;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfzyx, { 2, 3, 2, 2, 3 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfwzyx, { 2, 3, 2, 1, 3, 1 } }); // indices
// expected output dim: {6,1,3,1,2}
// expected output dim: v5{6,1,3,1,2}, v8{2,3,1,3,1,2}
set_values(input0, {
FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16),
@@ -76,7 +119,8 @@ TEST(gather_nd_gpu_fp16, d23322_i231312_ir6_batch2) {
FLOAT16(31), FLOAT16(32), FLOAT16(35), FLOAT16(36), FLOAT16(33), FLOAT16(34),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfzyx, {6, 1, 2, 1, 3});
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfwzyx, { 2, 3, 2, 1, 3, 1 });
}
TEST(gather_nd_gpu_fp16, d231322_i231321_ir6_batch5) {
@@ -86,7 +130,7 @@ TEST(gather_nd_gpu_fp16, d231322_i231321_ir6_batch5) {
const int batch_dims = 5;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfwzyx, { 2, 3, 2, 2, 3, 1 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfwzyx, { 2, 3, 1, 2, 3, 1 } }); // indices
// expected output dim: {36}
// expected output dim: v5{36}, v8{2, 3, 2, 3, 1}
set_values(input0, {
FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(17), FLOAT16(18), FLOAT16(19), FLOAT16(10), FLOAT16(21), FLOAT16(18),
@@ -118,7 +162,8 @@ TEST(gather_nd_gpu_fp16, d231322_i231321_ir6_batch5) {
FLOAT16(32), FLOAT16(33), FLOAT16(35), FLOAT16(38), FLOAT16(30), FLOAT16(29),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, {36, 1, 1, 1});
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfzyx, {2, 3, 2, 3, 1});
}
TEST(gather_nd_gpu_fp16, d23322_i23321_ir5_batch4) {
@@ -128,7 +173,7 @@ TEST(gather_nd_gpu_fp16, d23322_i23321_ir5_batch4) {
const int batch_dims = 4;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfzyx, { 2, 3, 2, 2, 3 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfzyx, { 2, 3, 1, 2, 3 } }); // indices
// expected output dim: {36}
// expected output dim: v5{36}, v8{2,3,2,3}
set_values(input0, {
FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(17), FLOAT16(18), FLOAT16(19), FLOAT16(10), FLOAT16(21), FLOAT16(18),
@@ -160,9 +205,11 @@ TEST(gather_nd_gpu_fp16, d23322_i23321_ir5_batch4) {
FLOAT16(32), FLOAT16(33), FLOAT16(35), FLOAT16(38), FLOAT16(30), FLOAT16(29),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 36, 1, 1, 1 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 2, 3, 2, 3 });
}
TEST(gather_nd_gpu_fp16, d23223_i2321_ir4_batch3) {
auto& engine = get_test_engine();
@@ -170,7 +217,7 @@ TEST(gather_nd_gpu_fp16, d23223_i2321_ir4_batch3) {
const int batch_dims = 3;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfzyx, { 2, 3, 3, 2, 2 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 2, 3, 1, 2 } }); // indices
// expected output dim: {2*3*2,3}
// expected output dim: v5{12,3} v8{2,3,3,2}
set_values(input0, {
FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(17), FLOAT16(18),FLOAT16(15), FLOAT16(16), FLOAT16(17), FLOAT16(18),
@@ -202,7 +249,8 @@ TEST(gather_nd_gpu_fp16, d23223_i2321_ir4_batch3) {
FLOAT16(29), FLOAT16(30), FLOAT16(31), FLOAT16(35), FLOAT16(36), FLOAT16(33),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 12, 3, 1, 1 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 2, 3, 3, 2 });
}
TEST(gather_nd_gpu_fp16, d2342_i2312_ir4_batch2) {
@@ -212,7 +260,7 @@ TEST(gather_nd_gpu_fp16, d2342_i2312_ir4_batch2) {
const int batch_dims = 2;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 2, 3, 2, 4 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 2, 3, 2, 1 } }); // indices
// expected output dim: {6,1}
// expected output dim: v5{6,1}, v8{2,3,1}
set_values(input0, {
FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(17), FLOAT16(18),
@@ -244,7 +292,8 @@ TEST(gather_nd_gpu_fp16, d2342_i2312_ir4_batch2) {
FLOAT16(33),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 6, 1, 1, 1 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 2, 3, 1, 1 });
}
TEST(gather_nd_gpu_fp16, d234_i2311_ir4_batch2) {
@@ -254,7 +303,7 @@ TEST(gather_nd_gpu_fp16, d234_i2311_ir4_batch2) {
const int batch_dims = 2;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 2, 3, 1, 4 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 2, 3, 1, 1 } }); // indices
// expected output dim: {6,1,1}
// expected output dim: v5{6,1,1}, v8{2,3,1,1}
set_values(input0, {
FLOAT16(1), FLOAT16(2), FLOAT16(3), FLOAT16(4),
@@ -287,7 +336,8 @@ TEST(gather_nd_gpu_fp16, d234_i2311_ir4_batch2) {
FLOAT16(23),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 6, 1, 1, 1 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 2, 3, 1, 1 });
}
TEST(gather_nd_gpu_fp16, d234_i21_ir2_batch1) {
@@ -297,7 +347,7 @@ TEST(gather_nd_gpu_fp16, d234_i21_ir2_batch1) {
const int batch_dims = 1;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 2, 3, 1, 4 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 2, 1, 1, 1 } }); // indices
// expected output dim: {2,4}
// expected output dim: v5{2,4,1,1}, v8{2,4,1,1}
set_values(input0, {
FLOAT16(1), FLOAT16(2), FLOAT16(3), FLOAT16(4),
@@ -320,7 +370,8 @@ TEST(gather_nd_gpu_fp16, d234_i21_ir2_batch1) {
FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 2, 4, 1, 1 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 2, 4, 1, 1 });
}
TEST(gather_nd_gpu_fp16, d22_i21_ir2_batch1) {
@@ -330,7 +381,7 @@ TEST(gather_nd_gpu_fp16, d22_i21_ir2_batch1) {
const int batch_dims = 1;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 2, 2, 1, 1 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 2, 1, 1, 1 } }); // indices
// expected output dim: 2
// expected output dim: v5{2,1,1}, v8{2,1,1}
set_values(input0, {
FLOAT16(1), FLOAT16(2),
@@ -347,7 +398,8 @@ TEST(gather_nd_gpu_fp16, d22_i21_ir2_batch1) {
FLOAT16(3),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 2, 1, 1, 1 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 2, 1, 1, 1, 1 });
}
TEST(gather_nd_gpu_fp16, d3223_i321113_ir6_batch0) {
@@ -357,7 +409,7 @@ TEST(gather_nd_gpu_fp16, d3223_i321113_ir6_batch0) {
const int batch_dims = 0;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 3, 2, 3, 2 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfwzyx, { 3, 2, 3, 1, 1, 1 } }); // indices
// expected output dim: 321113
// expected output dim: 323111
set_values(input0, {
FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16),
@@ -392,7 +444,8 @@ TEST(gather_nd_gpu_fp16, d3223_i321113_ir6_batch0) {
FLOAT16(11), FLOAT16(12), FLOAT16(13),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfwzyx, { 3, 2, 3, 1, 1, 1 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfwzyx, { 3, 2, 3, 1, 1, 1 });
}
TEST(gather_nd_gpu_fp16, d3221_i32312_ir3_batch0) {
@@ -402,7 +455,7 @@ TEST(gather_nd_gpu_fp16, d3221_i32312_ir3_batch0) {
const int batch_dims = 0;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfzyx, { 3, 2, 2, 1, 3 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 3, 2, 1, 2 } }); // indices
// expected output dim: 32312
// expected output dim: 32213
set_values(input0, {
FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16),
@@ -437,7 +490,8 @@ TEST(gather_nd_gpu_fp16, d3221_i32312_ir3_batch0) {
FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfzyx, { 3, 2, 2, 1, 3 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfzyx, { 3, 2, 2, 1, 3 });
}
TEST(gather_nd_gpu_fp16, d3231_i32312_ir3_batch0) {
@@ -447,7 +501,7 @@ TEST(gather_nd_gpu_fp16, d3231_i32312_ir3_batch0) {
const int batch_dims = 0;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfzyx, { 3, 2, 2, 1, 3 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 3, 2, 1, 3 } }); // indices
// expected output dim: {3,2,1,2}
// expected output dim: {3,2,2,1}
set_values(input0, {
FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16),
@@ -482,7 +536,8 @@ TEST(gather_nd_gpu_fp16, d3231_i32312_ir3_batch0) {
FLOAT16(11), FLOAT16(12),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 3, 2, 2, 1 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 3, 2, 2, 1 });
}
TEST(gather_nd_gpu_fp16, d3112_i3221_ir4_batch0) {
@@ -522,47 +577,8 @@ TEST(gather_nd_gpu_fp16, d3112_i3221_ir4_batch0) {
FLOAT16(1), FLOAT16(2), FLOAT16(7), FLOAT16(8),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
}
TEST(gather_nd_gpu_fp16, d311211_i322111_ir4_batch0) {
auto& engine = get_test_engine();
const int indices_rank = 4;
const int batch_dims = 0;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfwzyx, { 3, 1, 1, 1, 2, 1 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfwzyx, { 3, 2, 1, 1, 1, 2 } }); // indices
// expected output dim: {3,2,2,1,1,2,1,1}
set_values(input0, {
FLOAT16(1), FLOAT16(2),
FLOAT16(7), FLOAT16(8),
FLOAT16(13), FLOAT16(14),
});
set_values(input1, {
FLOAT16(2), FLOAT16(1),
FLOAT16(0), FLOAT16(1),
FLOAT16(2), FLOAT16(1),
FLOAT16(0), FLOAT16(1),
FLOAT16(2), FLOAT16(1),
FLOAT16(0), FLOAT16(1),
});
std::vector<float> expected_results = {
FLOAT16(13), FLOAT16(14), FLOAT16(7), FLOAT16(8),
FLOAT16(1), FLOAT16(2), FLOAT16(7), FLOAT16(8),
FLOAT16(13), FLOAT16(14), FLOAT16(7), FLOAT16(8),
FLOAT16(1), FLOAT16(2), FLOAT16(7), FLOAT16(8),
FLOAT16(13), FLOAT16(14), FLOAT16(7), FLOAT16(8),
FLOAT16(1), FLOAT16(2), FLOAT16(7), FLOAT16(8),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfwzyx, { 3, 2, 2, 1, 1, 2 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfwzyx, { 3, 2, 2, 1, 1, 2 });
}
TEST(gather_nd_gpu_fp16, d3332_i3223_ir4_batch0) {
@@ -572,6 +588,7 @@ TEST(gather_nd_gpu_fp16, d3332_i3223_ir4_batch0) {
const int batch_dims = 0;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 3, 3, 3, 2 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 3, 2, 3, 2 } }); // indices
// expected output dim: {3,2,3,2}
set_values(input0, {
FLOAT16(1), FLOAT16(2), FLOAT16(3), FLOAT16(4), FLOAT16(5), FLOAT16(6),
@@ -609,7 +626,8 @@ TEST(gather_nd_gpu_fp16, d3332_i3223_ir4_batch0) {
FLOAT16(34), FLOAT16(35), FLOAT16(36), FLOAT16(16), FLOAT16(17), FLOAT16(18),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 3, 2, 3, 2 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 3, 2, 3, 2 });
}
TEST(gather_nd_gpu_fp16, d3323_i322_ir3_batch0) {
@@ -619,6 +637,7 @@ TEST(gather_nd_gpu_fp16, d3323_i322_ir3_batch0) {
const int batch_dims = 0;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 3, 3, 3, 2 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 3, 2, 1, 2 } }); // indices
// expected output dim: {3,2,3,2}
set_values(input0, {
FLOAT16(1), FLOAT16(2), FLOAT16(3), FLOAT16(4), FLOAT16(5), FLOAT16(6),
@@ -656,7 +675,8 @@ TEST(gather_nd_gpu_fp16, d3323_i322_ir3_batch0) {
FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(17), FLOAT16(18),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 3, 2, 3, 2 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 3, 2, 3, 2 });
}
TEST(gather_nd_gpu_fp16, d22_i21_ir2_batch0) {
@@ -666,6 +686,7 @@ TEST(gather_nd_gpu_fp16, d22_i21_ir2_batch0) {
const int batch_dims = 0;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 2, 2, 1, 1 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 2, 1, 1, 1 } }); // indices
// expected output dim: {2,2,1,1}
set_values(input0, {
FLOAT16(1), FLOAT16(2),
@@ -681,7 +702,8 @@ TEST(gather_nd_gpu_fp16, d22_i21_ir2_batch0) {
FLOAT16(1), FLOAT16(2),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 2, 2, 1, 1 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 2, 2, 1, 1 });
}
TEST(gather_nd_gpu_fp16, d22_i32_ir2_batch0) {
@@ -691,6 +713,7 @@ TEST(gather_nd_gpu_fp16, d22_i32_ir2_batch0) {
const int batch_dims = 0;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 2, 2, 1, 1 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 3, 2, 1, 1 } }); // indices
// expected output dim: {3,1,1}
set_values(input0, {
FLOAT16(1), FLOAT16(2),
@@ -709,5 +732,6 @@ TEST(gather_nd_gpu_fp16, d22_i32_ir2_batch0) {
FLOAT16(4),
};
DoTest(engine,input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine,input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 3, 1, 1, 1 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 3, 1, 1, 1 });
}