[GPU] Add new operation GatherND-8 (#8586)

Signed-off-by: Kelvin Choi <kelvin.choi@intel.com>
This commit is contained in:
Kelvin Choi
2021-11-18 13:24:02 +09:00
committed by GitHub
parent 1dbb8910e3
commit 517cf8dff6
14 changed files with 297 additions and 141 deletions

View File

@@ -211,6 +211,6 @@ REGISTER_FACTORY(v7, Gather);
// ------------------------------ Supported v8 ops ------------------------------ //
REGISTER_FACTORY(v8, Gather);
REGISTER_FACTORY(v8, GatherND);
// --------------------------- Supported internal ops --------------------------- //
REGISTER_FACTORY(internal, NonMaxSuppressionIEInternal);

View File

@@ -26,6 +26,7 @@ static void CreateGatherNDOp(Program& p, const std::shared_ptr<ngraph::op::v5::G
inputPrimitives[1],
indices_rank,
batch_dims,
true,
op->get_friendly_name());
p.AddPrimitive(primitive);
@@ -34,4 +35,27 @@ static void CreateGatherNDOp(Program& p, const std::shared_ptr<ngraph::op::v5::G
REGISTER_FACTORY_IMPL(v5, GatherND);
// Converts an ngraph v8::GatherND operation into a cldnn gather_nd primitive.
// The only difference from the v5 path is the batch_merged_output flag:
// it is passed as false here, so the batched output dimensions are kept
// separate instead of being merged into a single dimension.
static void CreateGatherNDOp(Program& p, const std::shared_ptr<ngraph::op::v8::GatherND>& op) {
p.ValidateInputs(op, { 2 });  // GatherND takes exactly two inputs: data and indices
auto inputPrimitives = p.GetInputPrimitiveIDs(op);
std::string layerName = layer_type_name_ID(op);
// Rank of the indices tensor (input 1) is required by the kernel.
int32_t indices_rank = static_cast<int32_t>(op->get_input_shape(1).size());
auto batch_dims = op->get_batch_dims();
auto primitive = cldnn::gather_nd(layerName,
inputPrimitives[0],
inputPrimitives[1],
indices_rank,
batch_dims,
false,  // batch_merged_output: false for v8
op->get_friendly_name());
p.AddPrimitive(primitive);
p.AddPrimitiveToProfiler(op);
}
REGISTER_FACTORY_IMPL(v8, GatherND);
} // namespace CLDNNPlugin

View File

@@ -3,7 +3,6 @@
//
#include <vector>
#include <ngraph/opsets/opset5.hpp>
#include "single_layer_tests/gather_nd.hpp"
#include "common_test_utils/test_constants.hpp"
@@ -33,15 +32,6 @@ const auto gatherNDArgsSubset1 = ::testing::Combine(
::testing::ValuesIn(std::vector<int>({ 0, 1 })) // Batch dims
);
INSTANTIATE_TEST_SUITE_P(smoke_GatherND_set1, GatherNDLayerTest,
::testing::Combine(
gatherNDArgsSubset1,
::testing::ValuesIn(inputPrecisions),
::testing::ValuesIn(idxPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GPU),
::testing::Values<Config>({})),
GatherNDLayerTest::getTestCaseName);
// set2
const auto gatherNDArgsSubset2 = ::testing::Combine(
::testing::ValuesIn(std::vector<std::vector<size_t>>(
@@ -51,15 +41,6 @@ const auto gatherNDArgsSubset2 = ::testing::Combine(
::testing::ValuesIn(std::vector<int>({ 1, 2 })) // Batch dims
);
INSTANTIATE_TEST_SUITE_P(smoke_GatherND_set2, GatherNDLayerTest,
::testing::Combine(
gatherNDArgsSubset2,
::testing::ValuesIn(inputPrecisions),
::testing::ValuesIn(idxPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GPU),
::testing::Values<Config>({})),
GatherNDLayerTest::getTestCaseName);
// set3
const auto gatherNDArgsSubset3 = ::testing::Combine(
::testing::ValuesIn(std::vector<std::vector<size_t>>(
@@ -69,7 +50,27 @@ const auto gatherNDArgsSubset3 = ::testing::Combine(
::testing::ValuesIn(std::vector<int>({ 3, 4 })) // Batch dims
);
INSTANTIATE_TEST_SUITE_P(smoke_GatherND_set3, GatherNDLayerTest,
// -------------------------------- V5 --------------------------------
INSTANTIATE_TEST_SUITE_P(smoke_GatherND5_set1, GatherNDLayerTest,
::testing::Combine(
gatherNDArgsSubset1,
::testing::ValuesIn(inputPrecisions),
::testing::ValuesIn(idxPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GPU),
::testing::Values<Config>({})),
GatherNDLayerTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_GatherND5_set2, GatherNDLayerTest,
::testing::Combine(
gatherNDArgsSubset2,
::testing::ValuesIn(inputPrecisions),
::testing::ValuesIn(idxPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GPU),
::testing::Values<Config>({})),
GatherNDLayerTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_GatherND5_set3, GatherNDLayerTest,
::testing::Combine(
gatherNDArgsSubset3,
::testing::ValuesIn(inputPrecisions),
@@ -78,4 +79,32 @@ INSTANTIATE_TEST_SUITE_P(smoke_GatherND_set3, GatherNDLayerTest,
::testing::Values<Config>({})),
GatherNDLayerTest::getTestCaseName);
// -------------------------------- V8 --------------------------------
INSTANTIATE_TEST_SUITE_P(smoke_GatherND8_set1, GatherND8LayerTest,
::testing::Combine(
gatherNDArgsSubset1,
::testing::ValuesIn(inputPrecisions),
::testing::ValuesIn(idxPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GPU),
::testing::Values<Config>({})),
GatherND8LayerTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_GatherND8_set2, GatherND8LayerTest,
::testing::Combine(
gatherNDArgsSubset2,
::testing::ValuesIn(inputPrecisions),
::testing::ValuesIn(idxPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GPU),
::testing::Values<Config>({})),
GatherND8LayerTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_GatherND8_set3, GatherND8LayerTest,
::testing::Combine(
gatherNDArgsSubset3,
::testing::ValuesIn(inputPrecisions),
::testing::ValuesIn(idxPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GPU),
::testing::Values<Config>({})),
GatherND8LayerTest::getTestCaseName);
} // namespace

View File

@@ -9,6 +9,8 @@
#include <vector>
#include "shared_test_classes/base/layer_test_utils.hpp"
#include "ngraph_functions/builders.hpp"
#include "ngraph_functions/utils/ngraph_helpers.hpp"
namespace LayerTestsDefinitions {
using Config = std::map<std::string, std::string>;

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "ngraph_functions/builders.hpp"
#include "shared_test_classes/single_layer/gather_nd.hpp"
namespace LayerTestsDefinitions {
@@ -55,6 +54,7 @@ void GatherNDLayerTest::SetUp() {
function = std::make_shared<ngraph::Function>(results, params, "gatherND");
}
// v8 test cases are parameterized identically to v5, so the v5 name
// formatting is reused verbatim.
std::string GatherND8LayerTest::getTestCaseName(const testing::TestParamInfo<GatherNDParams>& obj) {
return GatherNDLayerTest::getTestCaseName(obj);
}
@@ -70,13 +70,13 @@ void GatherND8LayerTest::SetUp() {
auto ngDPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(dPrecision);
auto ngIPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(iPrecision);
auto params = ngraph::builder::makeParams(ngDPrc, {dataShape});
auto params = ngraph::builder::makeParams(ngDPrc, { dataShape });
auto paramOuts = ngraph::helpers::convert2OutputVector(
ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));
ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));
auto dataNode = paramOuts[0];
auto gather = std::dynamic_pointer_cast<ngraph::opset8::GatherND>(
ngraph::builder::makeGatherND(dataNode, indicesShape, ngIPrc, batchDims));
ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(gather)};
ngraph::builder::makeGatherND8(dataNode, indicesShape, ngIPrc, batchDims));
ngraph::ResultVector results{ std::make_shared<ngraph::opset1::Result>(gather) };
function = std::make_shared<ngraph::Function>(results, params, "gatherND");
}

View File

@@ -531,6 +531,12 @@ std::shared_ptr<ngraph::Node> makeGatherND(
const element::Type& indicesType,
const std::size_t batchDims);
// Builds an opset8 GatherND node over dataNode, generating a constant
// indices tensor of the given shape/type whose values are clamped into
// the valid range of the data dimensions they address.
std::shared_ptr<ngraph::Node> makeGatherND8(
const ngraph::Output<Node>& dataNode,
const ngraph::Shape& indicesShape,
const element::Type& indicesType,
const std::size_t batchDims);
std::shared_ptr<ngraph::Node> makeTile(const ngraph::Output<Node>& in,
const std::vector<int64_t>& repeats);

View File

@@ -41,5 +41,34 @@ std::shared_ptr<Node> makeGatherND(
return gatherNdNode;
}
// Builds an opset8 GatherND node over dataNode with a randomly generated
// constant indices tensor of shape indicesShape.  Every generated index is
// reduced modulo the size of the data dimension it addresses, so the
// resulting indices are always valid for dataNode's shape.
//
// @param dataNode     Data input of the GatherND node.
// @param indicesShape Shape of the generated indices constant; the last
//                     dimension is the slice rank (number of data dims
//                     each index tuple addresses, after the batch dims).
// @param indicesType  Element type of the indices constant.
// @param batchDims    Number of leading batch dimensions shared between
//                     data and indices.
// @return The constructed opset8::GatherND node.
std::shared_ptr<Node> makeGatherND8(
    const ngraph::Output<Node>& dataNode,
    const ngraph::Shape& indicesShape,
    const element::Type& indicesType,
    const std::size_t batchDims) {
    const auto indices = [&] {
        const auto& dataShape = dataNode.get_shape();
        // Number of index tuples: product of all indices dims except the last.
        const auto indicesCount = std::accumulate(begin(indicesShape), prev(end(indicesShape)),
                                                  1ull, std::multiplies<std::size_t>{});
        const auto sliceRank = indicesShape.back();

        const auto maxDim = *std::max_element(begin(dataShape), end(dataShape));

        auto indicesValues = NGraphFunctions::Utils::generateVector<element::Type_t::i32>(indicesCount * sliceRank, maxDim, 0);
        auto indicesData = indicesValues.data();
        // Use size_t counters: indicesCount/sliceRank are unsigned, and int
        // counters would trigger sign-compare warnings (and truncate for
        // very large counts).
        for (std::size_t i = 0; i < indicesCount; i++) {
            for (std::size_t dim = 0; dim < sliceRank; dim++) {
                // Clamp the generated value into the valid range of the data
                // dimension this position addresses (batch dims are skipped).
                *indicesData = *indicesData % dataShape[dim + batchDims];
                indicesData++;
            }
        }
        return opset8::Constant::create(indicesType, indicesShape, indicesValues);
    }();

    auto gatherNdNode = std::make_shared<opset8::GatherND>(dataNode, indices, batchDims);
    gatherNdNode->set_friendly_name("GatherND");

    return gatherNdNode;
}
} // namespace builder
} // namespace ngraph

View File

@@ -19,25 +19,37 @@ struct gather_nd : public primitive_base<gather_nd> {
CLDNN_DECLARE_PRIMITIVE(gather_nd)
/// @brief Constructs gather_nd primitive.
/// @param id This primitive id.
/// @param data Input data primitive id.
/// @param indices Input indexes primitive id.
/// @param indices_rank Rank of indices.
/// @param batch_dims batch_dims as an attribute of GatherND. Optional.
///
/// @param id This primitive id.
/// @param data Input data primitive id.
/// @param indices Input indexes primitive id.
/// @param indices_rank Rank of indices.
/// @param batch_dims batch_dims as an attribute of GatherND. Optional.
/// @param batch_merged_output batched output shape is merged as a dimension for v5.
/// In case of output{3, 2, 4, 5} at batch_dims = 2, real output shape should be {6, 4, 5}.
/// This should be false for v8.
/// For batch_dims < 2, this doesn't have any meaning.
gather_nd(const primitive_id& id,
const primitive_id& data,
const primitive_id& indices,
const uint8_t indices_rank,
const uint8_t batch_dims = 0,
const bool batch_merged_output = true,
const primitive_id& ext_prim_id = "",
const padding& output_padding = padding())
: primitive_base(id, {data, indices}, ext_prim_id, output_padding), indices_rank(indices_rank), batch_dims(batch_dims) {}
: primitive_base(id, {data, indices}, ext_prim_id, output_padding),
indices_rank(indices_rank),
batch_dims(batch_dims),
batch_merged_output(batch_merged_output) {}
/// @brief GatherND indices_rank
uint8_t indices_rank;
/// @brief GatherND batch_dims
uint8_t batch_dims;
/// @brief GatherND batch_merged_output
bool batch_merged_output;
};
/// @}
/// @}

View File

@@ -117,6 +117,7 @@ JitConstants GatherNDKernelRef::GetJitConstants(const gather_nd_params& params)
jit.AddConstant(MakeJitConstant("INDICES_RANK", params.indices_rank));
jit.AddConstant(MakeJitConstant("BATCH_DIMS", params.batch_dims));
jit.AddConstant(MakeJitConstant("BATCH_MERGED_OUTPUT", params.batch_merged_output));
jit.AddConstant(MakeJitConstant("WI_SLICE_SIZE", GetSliceSize(params)));
jit.AddConstant(MakeJitConstant("INDICES_LAST_DIM", GetIndicesLastDim(params)));

View File

@@ -11,11 +11,13 @@ namespace kernel_selector {
// gather_nd_params
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
struct gather_nd_params : public base_params {
gather_nd_params() : base_params(KernelType::GATHER_ND), indices_rank(0), batch_dims(0) {}
gather_nd_params() : base_params(KernelType::GATHER_ND), indices_rank(0), batch_dims(0), batch_merged_output(true) {}
uint8_t indices_rank;
uint8_t batch_dims;
bool batch_merged_output;
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

View File

@@ -125,28 +125,47 @@ KERNEL(gather_nd_ref)(const __global INPUT0_TYPE* data,
const uint out_f = idx_f;
const uint out_b = idx_b;
#else
uint pitch_acc = 1;
uint output_batch_size = 0;
for (int i = BATCH_DIMS - 1; i >= 0; i--) {
output_batch_size += (idx_arr[i] * pitch_acc);
pitch_acc *= idx_dim[i];
}
#if BATCH_MERGED_OUTPUT
uint pitch_acc = 1;
uint output_batch_size = 0;
for (int i = BATCH_DIMS - 1; i >= 0; i--) {
output_batch_size += (idx_arr[i] * pitch_acc);
pitch_acc *= idx_dim[i];
}
#if OUTPUT_DIMS == 4
const uint out_x = idx_arr[BATCH_DIMS+2];
const uint out_y = idx_arr[BATCH_DIMS+1];
#elif OUTPUT_DIMS == 5
const uint out_x = idx_arr[BATCH_DIMS+3];
const uint out_y = idx_arr[BATCH_DIMS+2];
const uint out_z = idx_arr[BATCH_DIMS+1];
#if OUTPUT_DIMS == 4
const uint out_x = idx_arr[BATCH_DIMS+2];
const uint out_y = idx_arr[BATCH_DIMS+1];
#elif OUTPUT_DIMS == 5
const uint out_x = idx_arr[BATCH_DIMS+3];
const uint out_y = idx_arr[BATCH_DIMS+2];
const uint out_z = idx_arr[BATCH_DIMS+1];
#else
const uint out_x = idx_arr[BATCH_DIMS+4];
const uint out_y = idx_arr[BATCH_DIMS+3];
const uint out_z = idx_arr[BATCH_DIMS+2];
const uint out_w = idx_arr[BATCH_DIMS+1];
#endif
const uint out_f = idx_arr[BATCH_DIMS+0];
const uint out_b = output_batch_size;
#else
const uint out_x = idx_arr[BATCH_DIMS+4];
const uint out_y = idx_arr[BATCH_DIMS+3];
const uint out_z = idx_arr[BATCH_DIMS+2];
const uint out_w = idx_arr[BATCH_DIMS+1];
#if OUTPUT_DIMS == 4
const uint out_x = idx_arr[3];
const uint out_y = idx_arr[2];
#elif OUTPUT_DIMS == 5
const uint out_x = idx_arr[4];
const uint out_y = idx_arr[3];
const uint out_z = idx_arr[2];
#else
const uint out_x = idx_arr[5];
const uint out_y = idx_arr[4];
const uint out_z = idx_arr[3];
const uint out_w = idx_arr[2];
#endif
const uint out_f = idx_arr[1];
const uint out_b = idx_arr[0];
#endif
const uint out_f = idx_arr[BATCH_DIMS+0];
const uint out_b = output_batch_size;
#endif
const uint output_idx = GET_OUTPUT_INDEX(OUT_ORDER);

View File

@@ -41,34 +41,41 @@ layout gather_nd_inst::calc_output_layout(gather_nd_node const& node) {
output_sizes.push_back(input_layout[x]);
}
// calculate batch_size by batch_dims
int batch_size = 1;
for (uint8_t x = 0; x < batch_dims; x++) {
batch_size *= output_sizes[x];
}
// create final output shape by batch_dims
std::vector<tensor::value_type> final_output_sizes;
if (batch_dims > 0) {
final_output_sizes.push_back(batch_size);
if (op->batch_merged_output) {
// calculate batch_size by batch_dims
int batch_size = 1;
for (uint8_t x = 0; x < batch_dims; x++) {
batch_size *= output_sizes[x];
}
if (batch_dims > 0) {
final_output_sizes.push_back(batch_size);
}
for (size_t x = static_cast<size_t>(batch_dims); x < output_sizes.size(); x++) {
final_output_sizes.push_back(output_sizes[x]);
}
} else {
for (size_t x = 0; x < output_sizes.size(); x++) {
final_output_sizes.push_back(output_sizes[x]);
}
}
for (size_t x = static_cast<size_t>(batch_dims); x < output_sizes.size(); x++) {
final_output_sizes.push_back(output_sizes[x]);
}
auto output_format = cldnn::format::bfyx;
if (final_output_sizes.size() >= 6) {
output_format = cldnn::format::bfwzyx;
auto output_format = cldnn::format::any;
if (final_output_sizes.size() <= 4) {
output_format = cldnn::format::bfyx;
} else if (final_output_sizes.size() == 5) {
output_format = cldnn::format::bfzyx;
} else {
output_format = cldnn::format::bfwzyx;
}
auto output_sizes_tensor = tensor(tensor(final_output_sizes).sizes(output_format));
auto padding = op->output_padding;
if (node.has_fused_primitives()) {
input_layout_origin.data_type = node.get_fused_output_layout().data_type;
}

View File

@@ -29,6 +29,7 @@ struct gather_nd_impl : typed_primitive_impl_ocl<gather_nd> {
gather_nd_params.indices_rank = arg.get_primitive()->indices_rank;
gather_nd_params.batch_dims = arg.get_primitive()->batch_dims;
gather_nd_params.batch_merged_output = arg.get_primitive()->batch_merged_output;
gather_nd_params.inputs.push_back(convert_data_tensor(arg.input(1).get_output_layout()));

View File

@@ -11,18 +11,20 @@
using namespace cldnn;
using namespace ::tests;
inline void DoTest(engine& engine,
inline void DoTestBase(engine& engine,
const cldnn::memory::ptr input0,
const cldnn::memory::ptr input1,
const std::vector<float>& expected_results,
const int indices_rank,
const int batch_dims) {
const int batch_dims,
const cldnn::format fmt,
const tensor ts,
const bool batch_merged_output) {
topology topology;
auto gather_nd_inst = gather_nd("gather_nd", "InputData", "InputIndices", indices_rank, batch_dims, batch_merged_output);
topology.add(input_layout("InputData", input0->get_layout()));
topology.add(input_layout("InputIndices", input1->get_layout()));
topology.add(
gather_nd("gather_nd", "InputData", "InputIndices", indices_rank, batch_dims)
);
topology.add(gather_nd_inst);
network network(engine, topology);
@@ -30,13 +32,54 @@ inline void DoTest(engine& engine,
network.set_input_data("InputIndices", input1);
auto outputs = network.execute();
auto output = outputs.at("gather_nd").get_memory();
cldnn::mem_lock<uint16_t> output_ptr(output, get_test_stream());
// Compare output shape
auto output_format = output->get_layout().format;
auto output_shape = output->get_layout().size;
EXPECT_EQ(fmt, output_format);
int32_t dim_size = 6;
if (fmt == format::bfyx) {
dim_size = 4;
} else if (fmt == format::bfzyx) {
dim_size = 5;
}
for (int32_t i = 0; i < dim_size; i++)
{
EXPECT_EQ(ts.sizes()[i], output_shape.sizes()[i]);
}
// Compare output value
cldnn::mem_lock<uint16_t> output_ptr(output, get_test_stream());
for (size_t i = 0; i < expected_results.size(); ++i) {
EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i]));
}
}
inline void DoTestV5(engine& engine,
const cldnn::memory::ptr input0,
const cldnn::memory::ptr input1,
const std::vector<float>& expected_results,
const int indices_rank,
const int batch_dims,
const cldnn::format fmt,
const tensor size) {
DoTestBase(engine, input0, input1, expected_results, indices_rank, batch_dims, fmt, size, true);
}
inline void DoTestV8(engine& engine,
const cldnn::memory::ptr input0,
const cldnn::memory::ptr input1,
const std::vector<float>& expected_results,
const int indices_rank,
const int batch_dims,
const cldnn::format fmt,
const tensor size) {
DoTestBase(engine, input0, input1, expected_results, indices_rank, batch_dims, fmt, size, false);
}
TEST(gather_nd_gpu_fp16, d23322_i231312_ir6_batch2) {
auto& engine = get_test_engine();
@@ -44,7 +87,7 @@ TEST(gather_nd_gpu_fp16, d23322_i231312_ir6_batch2) {
const int batch_dims = 2;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfzyx, { 2, 3, 2, 2, 3 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfwzyx, { 2, 3, 2, 1, 3, 1 } }); // indices
// expected output dim: {6,1,3,1,2}
// expected output dim: v5{6,1,3,1,2}, v8{2,3,1,3,1,2}
set_values(input0, {
FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16),
@@ -76,7 +119,8 @@ TEST(gather_nd_gpu_fp16, d23322_i231312_ir6_batch2) {
FLOAT16(31), FLOAT16(32), FLOAT16(35), FLOAT16(36), FLOAT16(33), FLOAT16(34),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfzyx, {6, 1, 2, 1, 3});
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfwzyx, { 2, 3, 2, 1, 3, 1 });
}
TEST(gather_nd_gpu_fp16, d231322_i231321_ir6_batch5) {
@@ -86,7 +130,7 @@ TEST(gather_nd_gpu_fp16, d231322_i231321_ir6_batch5) {
const int batch_dims = 5;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfwzyx, { 2, 3, 2, 2, 3, 1 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfwzyx, { 2, 3, 1, 2, 3, 1 } }); // indices
// expected output dim: {36}
// expected output dim: v5{36}, v8{2, 3, 2, 3, 1}
set_values(input0, {
FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(17), FLOAT16(18), FLOAT16(19), FLOAT16(10), FLOAT16(21), FLOAT16(18),
@@ -118,7 +162,8 @@ TEST(gather_nd_gpu_fp16, d231322_i231321_ir6_batch5) {
FLOAT16(32), FLOAT16(33), FLOAT16(35), FLOAT16(38), FLOAT16(30), FLOAT16(29),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, {36, 1, 1, 1});
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfzyx, {2, 3, 2, 3, 1});
}
TEST(gather_nd_gpu_fp16, d23322_i23321_ir5_batch4) {
@@ -128,7 +173,7 @@ TEST(gather_nd_gpu_fp16, d23322_i23321_ir5_batch4) {
const int batch_dims = 4;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfzyx, { 2, 3, 2, 2, 3 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfzyx, { 2, 3, 1, 2, 3 } }); // indices
// expected output dim: {36}
// expected output dim: v5{36}, v8{2,3,2,3}
set_values(input0, {
FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(17), FLOAT16(18), FLOAT16(19), FLOAT16(10), FLOAT16(21), FLOAT16(18),
@@ -160,9 +205,11 @@ TEST(gather_nd_gpu_fp16, d23322_i23321_ir5_batch4) {
FLOAT16(32), FLOAT16(33), FLOAT16(35), FLOAT16(38), FLOAT16(30), FLOAT16(29),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 36, 1, 1, 1 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 2, 3, 2, 3 });
}
TEST(gather_nd_gpu_fp16, d23223_i2321_ir4_batch3) {
auto& engine = get_test_engine();
@@ -170,7 +217,7 @@ TEST(gather_nd_gpu_fp16, d23223_i2321_ir4_batch3) {
const int batch_dims = 3;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfzyx, { 2, 3, 3, 2, 2 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 2, 3, 1, 2 } }); // indices
// expected output dim: {2*3*2,3}
// expected output dim: v5{12,3} v8{2,3,3,2}
set_values(input0, {
FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(17), FLOAT16(18),FLOAT16(15), FLOAT16(16), FLOAT16(17), FLOAT16(18),
@@ -202,7 +249,8 @@ TEST(gather_nd_gpu_fp16, d23223_i2321_ir4_batch3) {
FLOAT16(29), FLOAT16(30), FLOAT16(31), FLOAT16(35), FLOAT16(36), FLOAT16(33),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 12, 3, 1, 1 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 2, 3, 3, 2 });
}
TEST(gather_nd_gpu_fp16, d2342_i2312_ir4_batch2) {
@@ -212,7 +260,7 @@ TEST(gather_nd_gpu_fp16, d2342_i2312_ir4_batch2) {
const int batch_dims = 2;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 2, 3, 2, 4 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 2, 3, 2, 1 } }); // indices
// expected output dim: {6,1}
// expected output dim: v5{6,1}, v8{2,3,1}
set_values(input0, {
FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(17), FLOAT16(18),
@@ -244,7 +292,8 @@ TEST(gather_nd_gpu_fp16, d2342_i2312_ir4_batch2) {
FLOAT16(33),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 6, 1, 1, 1 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 2, 3, 1, 1 });
}
TEST(gather_nd_gpu_fp16, d234_i2311_ir4_batch2) {
@@ -254,7 +303,7 @@ TEST(gather_nd_gpu_fp16, d234_i2311_ir4_batch2) {
const int batch_dims = 2;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 2, 3, 1, 4 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 2, 3, 1, 1 } }); // indices
// expected output dim: {6,1,1}
// expected output dim: v5{6,1,1}, v8{2,3,1,1}
set_values(input0, {
FLOAT16(1), FLOAT16(2), FLOAT16(3), FLOAT16(4),
@@ -287,7 +336,8 @@ TEST(gather_nd_gpu_fp16, d234_i2311_ir4_batch2) {
FLOAT16(23),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 6, 1, 1, 1 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 2, 3, 1, 1 });
}
TEST(gather_nd_gpu_fp16, d234_i21_ir2_batch1) {
@@ -297,7 +347,7 @@ TEST(gather_nd_gpu_fp16, d234_i21_ir2_batch1) {
const int batch_dims = 1;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 2, 3, 1, 4 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 2, 1, 1, 1 } }); // indices
// expected output dim: {2,4}
// expected output dim: v5{2,4,1,1}, v8{2,4,1,1}
set_values(input0, {
FLOAT16(1), FLOAT16(2), FLOAT16(3), FLOAT16(4),
@@ -320,7 +370,8 @@ TEST(gather_nd_gpu_fp16, d234_i21_ir2_batch1) {
FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 2, 4, 1, 1 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 2, 4, 1, 1 });
}
TEST(gather_nd_gpu_fp16, d22_i21_ir2_batch1) {
@@ -330,7 +381,7 @@ TEST(gather_nd_gpu_fp16, d22_i21_ir2_batch1) {
const int batch_dims = 1;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 2, 2, 1, 1 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 2, 1, 1, 1 } }); // indices
// expected output dim: 2
// expected output dim: v5{2,1,1}, v8{2,1,1}
set_values(input0, {
FLOAT16(1), FLOAT16(2),
@@ -347,7 +398,8 @@ TEST(gather_nd_gpu_fp16, d22_i21_ir2_batch1) {
FLOAT16(3),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 2, 1, 1, 1 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 2, 1, 1, 1, 1 });
}
TEST(gather_nd_gpu_fp16, d3223_i321113_ir6_batch0) {
@@ -357,7 +409,7 @@ TEST(gather_nd_gpu_fp16, d3223_i321113_ir6_batch0) {
const int batch_dims = 0;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 3, 2, 3, 2 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfwzyx, { 3, 2, 3, 1, 1, 1 } }); // indices
// expected output dim: 321113
// expected output dim: 323111
set_values(input0, {
FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16),
@@ -392,7 +444,8 @@ TEST(gather_nd_gpu_fp16, d3223_i321113_ir6_batch0) {
FLOAT16(11), FLOAT16(12), FLOAT16(13),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfwzyx, { 3, 2, 3, 1, 1, 1 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfwzyx, { 3, 2, 3, 1, 1, 1 });
}
TEST(gather_nd_gpu_fp16, d3221_i32312_ir3_batch0) {
@@ -402,7 +455,7 @@ TEST(gather_nd_gpu_fp16, d3221_i32312_ir3_batch0) {
const int batch_dims = 0;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfzyx, { 3, 2, 2, 1, 3 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 3, 2, 1, 2 } }); // indices
// expected output dim: 32312
// expected output dim: 32213
set_values(input0, {
FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16),
@@ -437,7 +490,8 @@ TEST(gather_nd_gpu_fp16, d3221_i32312_ir3_batch0) {
FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfzyx, { 3, 2, 2, 1, 3 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfzyx, { 3, 2, 2, 1, 3 });
}
TEST(gather_nd_gpu_fp16, d3231_i32312_ir3_batch0) {
@@ -447,7 +501,7 @@ TEST(gather_nd_gpu_fp16, d3231_i32312_ir3_batch0) {
const int batch_dims = 0;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfzyx, { 3, 2, 2, 1, 3 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 3, 2, 1, 3 } }); // indices
// expected output dim: {3,2,1,2}
// expected output dim: {3,2,2,1}
set_values(input0, {
FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16),
@@ -482,7 +536,8 @@ TEST(gather_nd_gpu_fp16, d3231_i32312_ir3_batch0) {
FLOAT16(11), FLOAT16(12),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 3, 2, 2, 1 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 3, 2, 2, 1 });
}
TEST(gather_nd_gpu_fp16, d3112_i3221_ir4_batch0) {
@@ -522,47 +577,8 @@ TEST(gather_nd_gpu_fp16, d3112_i3221_ir4_batch0) {
FLOAT16(1), FLOAT16(2), FLOAT16(7), FLOAT16(8),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
}
TEST(gather_nd_gpu_fp16, d311211_i322111_ir4_batch0) {
auto& engine = get_test_engine();
const int indices_rank = 4;
const int batch_dims = 0;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfwzyx, { 3, 1, 1, 1, 2, 1 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfwzyx, { 3, 2, 1, 1, 1, 2 } }); // indices
// expected output dim: {3,2,2,1,1,2,1,1}
set_values(input0, {
FLOAT16(1), FLOAT16(2),
FLOAT16(7), FLOAT16(8),
FLOAT16(13), FLOAT16(14),
});
set_values(input1, {
FLOAT16(2), FLOAT16(1),
FLOAT16(0), FLOAT16(1),
FLOAT16(2), FLOAT16(1),
FLOAT16(0), FLOAT16(1),
FLOAT16(2), FLOAT16(1),
FLOAT16(0), FLOAT16(1),
});
std::vector<float> expected_results = {
FLOAT16(13), FLOAT16(14), FLOAT16(7), FLOAT16(8),
FLOAT16(1), FLOAT16(2), FLOAT16(7), FLOAT16(8),
FLOAT16(13), FLOAT16(14), FLOAT16(7), FLOAT16(8),
FLOAT16(1), FLOAT16(2), FLOAT16(7), FLOAT16(8),
FLOAT16(13), FLOAT16(14), FLOAT16(7), FLOAT16(8),
FLOAT16(1), FLOAT16(2), FLOAT16(7), FLOAT16(8),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfwzyx, { 3, 2, 2, 1, 1, 2 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfwzyx, { 3, 2, 2, 1, 1, 2 });
}
TEST(gather_nd_gpu_fp16, d3332_i3223_ir4_batch0) {
@@ -572,6 +588,7 @@ TEST(gather_nd_gpu_fp16, d3332_i3223_ir4_batch0) {
const int batch_dims = 0;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 3, 3, 3, 2 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 3, 2, 3, 2 } }); // indices
// expected output dim: {3,2,3,2}
set_values(input0, {
FLOAT16(1), FLOAT16(2), FLOAT16(3), FLOAT16(4), FLOAT16(5), FLOAT16(6),
@@ -609,7 +626,8 @@ TEST(gather_nd_gpu_fp16, d3332_i3223_ir4_batch0) {
FLOAT16(34), FLOAT16(35), FLOAT16(36), FLOAT16(16), FLOAT16(17), FLOAT16(18),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 3, 2, 3, 2 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 3, 2, 3, 2 });
}
TEST(gather_nd_gpu_fp16, d3323_i322_ir3_batch0) {
@@ -619,6 +637,7 @@ TEST(gather_nd_gpu_fp16, d3323_i322_ir3_batch0) {
const int batch_dims = 0;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 3, 3, 3, 2 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 3, 2, 1, 2 } }); // indices
// expected output dim: {3,2,3,2}
set_values(input0, {
FLOAT16(1), FLOAT16(2), FLOAT16(3), FLOAT16(4), FLOAT16(5), FLOAT16(6),
@@ -656,7 +675,8 @@ TEST(gather_nd_gpu_fp16, d3323_i322_ir3_batch0) {
FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(17), FLOAT16(18),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 3, 2, 3, 2 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 3, 2, 3, 2 });
}
TEST(gather_nd_gpu_fp16, d22_i21_ir2_batch0) {
@@ -666,6 +686,7 @@ TEST(gather_nd_gpu_fp16, d22_i21_ir2_batch0) {
const int batch_dims = 0;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 2, 2, 1, 1 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 2, 1, 1, 1 } }); // indices
// expected output dim: {2,2,1,1}
set_values(input0, {
FLOAT16(1), FLOAT16(2),
@@ -681,7 +702,8 @@ TEST(gather_nd_gpu_fp16, d22_i21_ir2_batch0) {
FLOAT16(1), FLOAT16(2),
};
DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 2, 2, 1, 1 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 2, 2, 1, 1 });
}
TEST(gather_nd_gpu_fp16, d22_i32_ir2_batch0) {
@@ -691,6 +713,7 @@ TEST(gather_nd_gpu_fp16, d22_i32_ir2_batch0) {
const int batch_dims = 0;
auto input0 = engine.allocate_memory({ data_types::f16, format::bfyx, { 2, 2, 1, 1 } }); // data
auto input1 = engine.allocate_memory({ data_types::f16, format::bfyx, { 3, 2, 1, 1 } }); // indices
// expected output dim: {3,1,1}
set_values(input0, {
FLOAT16(1), FLOAT16(2),
@@ -709,5 +732,6 @@ TEST(gather_nd_gpu_fp16, d22_i32_ir2_batch0) {
FLOAT16(4),
};
DoTest(engine,input0, input1, expected_results, indices_rank, batch_dims);
DoTestV5(engine,input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 3, 1, 1, 1 });
DoTestV8(engine, input0, input1, expected_results, indices_rank, batch_dims, format::bfyx, { 3, 1, 1, 1 });
}