diff --git a/inference-engine/src/cldnn_engine/cldnn_program.cpp b/inference-engine/src/cldnn_engine/cldnn_program.cpp index 86cdddb81ef..d8a15fa4174 100644 --- a/inference-engine/src/cldnn_engine/cldnn_program.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_program.cpp @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -598,6 +599,9 @@ Program::LayerType Program::LayerTypeFromStr(const std::string &str) { { "CTCGreedyDecoder", CTCGreedyDecoder }, { "PriorBoxClustered", PriorBoxClustered }, { "CumSum", CumSum }, + { "EmbeddingBagPackedSum", EmbeddingBagPackedSum }, + { "EmbeddingBagOffsetsSum", EmbeddingBagOffsetsSum }, + { "EmbeddingSegmentsSum", EmbeddingSegmentsSum }, }; auto it = LayerNameToType.find(str); if (it != LayerNameToType.end()) @@ -1280,6 +1284,12 @@ void Program::CreateSingleLayerPrimitive(cldnn::topology& topology, InferenceEng break; case CumSum: CreateCumSumPrimitive(topology, layer); break; + case EmbeddingBagPackedSum: CreateEmbeddingBagPackedSumPrimitive(topology, layer); + break; + case EmbeddingBagOffsetsSum: CreateEmbeddingBagOffsetsSumPrimitive(topology, layer); + break; + case EmbeddingSegmentsSum: CreateEmbeddingSegmentsSumPrimitive(topology, layer); + break; default: THROW_CLDNN_EXCEPTION("Unknown Layer Type: " << layer->type); } } @@ -4487,6 +4497,95 @@ void Program::CreatePriorBoxClusteredPrimitive(cldnn::topology& topology, Infere AddPrimitiveToProfiler(priorBoxLayerName, layer); } +void Program::CreateEmbeddingBagPackedSumPrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr& layer) { + ValidateLayer(layer, {2, 3}); + + auto inputPrimitives = GetPrevLayersPrimitives(layer); + auto embeddingBag = as(layer); + + auto layerName = layer_type_name_ID(layer); + auto embeddingBagPrim = cldnn::embedding_bag( + layerName, + inputPrimitives, + cldnn::embedding_bag::packed_sum, + CldnnTensorFromIEDims(embeddingBag->outData[0]->getTensorDesc().getDims())); + + topology.add(embeddingBagPrim); + 
AddPrimitiveToProfiler(layerName, layer); +} + +void Program::CreateEmbeddingBagOffsetsSumPrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr& layer) { + ValidateLayer(layer, {3, 4, 5}); + + auto inputPrimitives = GetPrevLayersPrimitives(layer); + auto embeddingBag = as(layer); + + int32_t defaultIndex = -1; + if (inputPrimitives.size() > 3) { + auto defaultIndexInput = layer->insData[3].lock(); + auto defaultIndexInputCreator = defaultIndexInput->getCreatorLayer().lock(); + if (defaultIndexInputCreator->blobs.size() == 1) { + auto constantBlob = defaultIndexInputCreator->blobs.begin()->second; + auto defaultIndexPrecision = constantBlob->getTensorDesc().getPrecision(); + if (defaultIndexPrecision == InferenceEngine::Precision::I32) { + auto data = constantBlob->buffer().as(); + defaultIndex = data[0]; + } else { + THROW_IE_EXCEPTION << layer->name << "Incorrect EmbeddingBagOfsetsSum default_index precision"; + } + } + inputPrimitives.erase(inputPrimitives.begin() + 3); // Remove "default_index" + } + + auto layerName = layer_type_name_ID(layer); + auto embeddingBagPrim = cldnn::embedding_bag( + layerName, + inputPrimitives, + cldnn::embedding_bag::offsets_sum, + CldnnTensorFromIEDims(embeddingBag->outData[0]->getTensorDesc().getDims()), + defaultIndex); + + topology.add(embeddingBagPrim); + AddPrimitiveToProfiler(layerName, layer); +} + +void Program::CreateEmbeddingSegmentsSumPrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr& layer) { + ValidateLayer(layer, {4, 5, 6}); + + auto inputPrimitives = GetPrevLayersPrimitives(layer); + auto embeddingBag = as(layer); + + inputPrimitives.erase(inputPrimitives.begin() + 3); // Remove "num_segments" + + int32_t defaultIndex = -1; + if (inputPrimitives.size() > 3) { + auto defaultIndexInput = layer->insData[4].lock(); + auto defaultIndexInputCreator = defaultIndexInput->getCreatorLayer().lock(); + if (defaultIndexInputCreator->blobs.size() == 1) { + auto constantBlob = 
defaultIndexInputCreator->blobs.begin()->second; + auto defaultIndexPrecision = constantBlob->getTensorDesc().getPrecision(); + if (defaultIndexPrecision == InferenceEngine::Precision::I32) { + auto data = constantBlob->buffer().as(); + defaultIndex = data[0]; + } else { + THROW_IE_EXCEPTION << layer->name << "Incorrect EmbeddingBagOfsetsSum default_index precision"; + } + } + inputPrimitives.erase(inputPrimitives.begin() + 3); // Remove "default_index" + } + + auto layerName = layer_type_name_ID(layer); + auto embeddingBagPrim = cldnn::embedding_bag( + layerName, + inputPrimitives, + cldnn::embedding_bag::segments_sum, + CldnnTensorFromIEDims(embeddingBag->outData[0]->getTensorDesc().getDims()), + defaultIndex); + + topology.add(embeddingBagPrim); + AddPrimitiveToProfiler(layerName, layer); +} + bool Program::IsValidSplitConvMerge(const InferenceEngine::SplitLayer *splitLayer) const { if (splitLayer->outData.size() != 2) return false; // split into 2 diff --git a/inference-engine/src/cldnn_engine/cldnn_program.h b/inference-engine/src/cldnn_engine/cldnn_program.h index 18b6150f3cd..588d7dd36a8 100644 --- a/inference-engine/src/cldnn_engine/cldnn_program.h +++ b/inference-engine/src/cldnn_engine/cldnn_program.h @@ -217,6 +217,9 @@ public: CTCGreedyDecoder, PriorBoxClustered, CumSum, + EmbeddingBagPackedSum, + EmbeddingBagOffsetsSum, + EmbeddingSegmentsSum, NO_TYPE }; using GenericBlobMap = std::map; @@ -373,6 +376,9 @@ private: void CreateCTCGreedyDecoderPrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr& layer); void CreatePriorBoxClusteredPrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr& layer); void CreateCumSumPrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr& layer); + void CreateEmbeddingBagPackedSumPrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr& layer); + void CreateEmbeddingBagOffsetsSumPrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr& layer); + void 
CreateEmbeddingSegmentsSumPrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr& layer); }; } // namespace CLDNNPlugin diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/embedding_bag_offsets_sum.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/embedding_bag_offsets_sum.cpp new file mode 100644 index 00000000000..0557fb59fa3 --- /dev/null +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/embedding_bag_offsets_sum.cpp @@ -0,0 +1,59 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "single_layer_tests/embedding_bag_offsets_sum.hpp" + +#include + +#include "common_test_utils/test_constants.hpp" + +using namespace LayerTestsDefinitions; + +namespace { + +const std::vector netPrecisions = { + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::FP16 +}; + +const std::vector indPrecisions = { + InferenceEngine::Precision::I64, + InferenceEngine::Precision::I32 +}; + +const std::vector> emb_table_shape = { + {5, 6}, + {10, 35}, + {5, 4, 16} +}; + +const std::vector> indices = { + {0, 1, 2, 2, 3}, + {4, 4, 3, 1, 0}, + {1, 2, 1, 2, 1, 2, 1, 2, 1, 2} +}; + +const std::vector> offsets = { + {0, 2}, + {0, 0, 2, 2}, + {2, 4} +}; + +const std::vector default_index = {0, 4}; +const std::vector with_weights = {false, true}; +const std::vector with_default_index = {false, true}; + +const auto embBagOffsetSumArgSet = ::testing::Combine( + ::testing::ValuesIn(emb_table_shape), ::testing::ValuesIn(indices), + ::testing::ValuesIn(offsets), ::testing::ValuesIn(default_index), + ::testing::ValuesIn(with_weights), ::testing::ValuesIn(with_default_index)); + +INSTANTIATE_TEST_CASE_P( + smoke, EmbeddingBagOffsetsSumLayerTest, + ::testing::Combine(embBagOffsetSumArgSet, + ::testing::ValuesIn(netPrecisions), + ::testing::ValuesIn(indPrecisions), + 
::testing::Values(CommonTestUtils::DEVICE_GPU)), + EmbeddingBagOffsetsSumLayerTest::getTestCaseName); +} // namespace diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/embedding_bag_packed_sum.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/embedding_bag_packed_sum.cpp new file mode 100644 index 00000000000..c2aa5ef2d58 --- /dev/null +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/embedding_bag_packed_sum.cpp @@ -0,0 +1,49 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "single_layer_tests/embedding_bag_packed_sum.hpp" + +#include + +#include "common_test_utils/test_constants.hpp" + +using namespace LayerTestsDefinitions; + +namespace { + +const std::vector netPrecisions = { + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::FP16, +}; + +const std::vector indPrecisions = { + InferenceEngine::Precision::I64, + InferenceEngine::Precision::I32 +}; + +const std::vector> emb_table_shape = { + {5, 6}, + {10, 35}, + {5, 4, 16} +}; + +const std::vector>> indices = { + {{0, 1}, {2, 2}, {3, 4}}, + {{4, 4, 3}, {1, 0, 2}}, + {{1, 2, 1, 2}, {1, 2, 1, 2}} +}; +const std::vector with_weights = {false, true}; + +const auto embBagPackedSumArgSet = ::testing::Combine( + ::testing::ValuesIn(emb_table_shape), ::testing::ValuesIn(indices), + ::testing::ValuesIn(with_weights)); + +INSTANTIATE_TEST_CASE_P( + smoke, EmbeddingBagPackedSumLayerTest, + ::testing::Combine(embBagPackedSumArgSet, + ::testing::ValuesIn(netPrecisions), + ::testing::ValuesIn(indPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_GPU)), + EmbeddingBagPackedSumLayerTest::getTestCaseName); +} // namespace diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/embedding_segments_sum.cpp 
b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/embedding_segments_sum.cpp new file mode 100644 index 00000000000..14b5f7133b3 --- /dev/null +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/embedding_segments_sum.cpp @@ -0,0 +1,54 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "single_layer_tests/embedding_segments_sum.hpp" +#include "common_test_utils/test_constants.hpp" + +using namespace LayerTestsDefinitions; + +namespace { + +const std::vector netPrecisions = { + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::FP16 +}; + +const std::vector indPrecisions = { + InferenceEngine::Precision::I64, + InferenceEngine::Precision::I32 +}; + +const std::vector> emb_table_shape = { + {5, 6}, + {10, 35}, + {5, 4, 16} +}; +const std::vector> indices = { + {0, 1, 2, 2, 3}, + {4, 4, 3, 1, 2} +}; +const std::vector> segment_ids = { + {0, 1, 2, 3, 4}, + {0, 0, 2, 2, 4} +}; +const std::vector num_segments = {5, 7}; +const std::vector default_index = {0, 4}; +const std::vector with_weights = {false, true}; +const std::vector with_default_index = {false, true}; + +const auto embSegmentsSumArgSet = ::testing::Combine( + ::testing::ValuesIn(emb_table_shape), ::testing::ValuesIn(indices), + ::testing::ValuesIn(segment_ids), ::testing::ValuesIn(num_segments), + ::testing::ValuesIn(default_index), ::testing::ValuesIn(with_weights), + ::testing::ValuesIn(with_default_index)); + +INSTANTIATE_TEST_CASE_P( + smoke, EmbeddingSegmentsSumLayerTest, + ::testing::Combine(embSegmentsSumArgSet, ::testing::ValuesIn(netPrecisions), + ::testing::ValuesIn(indPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_GPU)), + EmbeddingSegmentsSumLayerTest::getTestCaseName); +} // namespace diff --git a/inference-engine/thirdparty/clDNN/api/embedding_bag.hpp b/inference-engine/thirdparty/clDNN/api/embedding_bag.hpp new file mode 100644 index 
00000000000..2dba4f4eeb9 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/api/embedding_bag.hpp @@ -0,0 +1,64 @@ +/* +// Copyright (c) 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once +#include "primitive.hpp" + +namespace cldnn { +/// @addtogroup cpp_api C++ API +/// @{ +/// @addtogroup cpp_topology Network Topology +/// @{ +/// @addtogroup cpp_primitives Primitives +/// @{ + +/// @brief Computes sums of "bags" of embeddings, without instantiating the intermediate embeddings. +/// @details For each index in `indices` this operator gets values from `data` embedding table and sums all values belonging to each bag. +struct embedding_bag : public primitive_base { + CLDNN_DECLARE_PRIMITIVE(embedding_bag) + + /// @brief Select type of embedding_bag operation + enum embedding_bag_type { + packed_sum, + offsets_sum, + segments_sum + }; + + /// @brief Constructs embedding_bag primitive. + /// @param id This primitive id. + /// @param inputs Vector with different inputs. 
+ /// @param output_shape Tensor with shape of output layout + /// @param default_index default index in embedding table to fill empty "bags" + embedding_bag(const primitive_id& id, + const std::vector& inputs, + const embedding_bag_type& type, + const tensor& output_shape, + const int32_t default_index = -1, + const padding& output_padding = padding()) + : primitive_base(id, inputs, output_padding), type(type), output_shape(output_shape), default_index(default_index) {} + + /// @brief Type of EmbeddingBag operation + embedding_bag_type type; + /// @brief Shape of output layout + tensor output_shape; + /// @brief Default index + int32_t default_index; +}; +/// @} +/// @} +/// @} +} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h b/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h index 0dff12d9e2b..ab1ee211ff6 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h @@ -82,7 +82,8 @@ enum class KernelType { SPACE_TO_DEPTH, GRN, CTC_GREEDY_DECODER, - CUM_SUM + CUM_SUM, + EMBEDDING_BAG }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -532,4 +533,13 @@ enum class CumSumAxis { FEATURE, BATCH }; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// EmbeddingBagType +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +enum class EmbeddingBagType { + PACKED_SUM, + OFFSETS_SUM, + SEGMENTS_SUM +}; } // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_ref.cpp new file 
mode 100644 index 00000000000..333ecc2a212 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_ref.cpp @@ -0,0 +1,140 @@ +/* +// Copyright (c) 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "embedding_bag_kernel_ref.h" +#include "kernel_selector_utils.h" +#include +#include + +namespace kernel_selector { +JitConstants EmbeddingBagKernelRef::GetJitConstants(const embedding_bag_params& params) const { + JitConstants jit = MakeBaseParamsJitConstants(params); + + switch (params.type) { + case EmbeddingBagType::PACKED_SUM: + jit.AddConstant(MakeJitConstant("PACKED_SUM", 1)); + break; + case EmbeddingBagType::OFFSETS_SUM: + jit.AddConstant(MakeJitConstant("OFFSETS_SUM", 1)); + break; + case EmbeddingBagType::SEGMENTS_SUM: + jit.AddConstant(MakeJitConstant("SEGMENTS_SUM", 1)); + break; + default: + break; + } + if (params.default_index > -1) + jit.AddConstant(MakeJitConstant("DEFAULT_INDEX", params.default_index)); + + return jit; +} + +CommonDispatchData EmbeddingBagKernelRef::SetDefault(const embedding_bag_params& params) const { + CommonDispatchData runInfo; + + std::vector global = { params.output.Batch().v, + params.output.Feature().v, + params.output.Y().v * params.output.X().v }; + + auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo); + + runInfo.gws0 = global[0]; + runInfo.gws1 = global[1]; + runInfo.gws2 = global[2]; + + 
runInfo.lws0 = local[0]; + runInfo.lws1 = local[1]; + runInfo.lws2 = local[2]; + + return runInfo; +} + +KernelsData EmbeddingBagKernelRef::GetKernelsData(const Params& params, const optional_params& options) const { + KernelData kd = KernelData::Default(params); + embedding_bag_params& newParams = *static_cast(kd.params.get()); + + if (!Validate(params, options)) { + return {}; + } + + auto runInfo = SetDefault(newParams); + auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); + auto cldnn_jit = GetJitConstants(newParams); + std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + + auto& kernel = kd.kernels[0]; + + FillCLKernelData(kernel, + runInfo, + params.engineInfo, + kernelName, + jit, + entry_point, + "", + false, + false, + (uint32_t)newParams.inputs.size()); + + kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE; + + return { kd }; +} + +ParamsKey EmbeddingBagKernelRef::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + k.EnableInputDataType(Datatype::INT32); + k.EnableInputDataType(Datatype::INT64); + k.EnableInputDataType(Datatype::UINT32); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + + k.EnableInputLayout(DataLayout::bfxy); + + k.EnableAllInputLayout(); + k.EnableAllOutputLayout(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableDifferentTypes(); + return k; +} + +bool EmbeddingBagKernelRef::Validate(const Params& p, const optional_params& o) const { + if (p.GetType() != KernelType::EMBEDDING_BAG || + o.GetType() != KernelType::EMBEDDING_BAG) { + return false; + } + const embedding_bag_params& params = static_cast(p); + + auto checkIntType = [](Datatype dt) { + if (dt != Datatype::INT32 && dt != Datatype::UINT32) + return false; + return true; + }; + + if (!checkIntType(params.inputs[1].GetDType())) + return false; + + if (params.type == EmbeddingBagType::OFFSETS_SUM || 
params.type == EmbeddingBagType::SEGMENTS_SUM) { + if (!checkIntType(params.inputs[2].GetDType())) + return false; + } + + return true; +} +} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_ref.h new file mode 100644 index 00000000000..3333d79ec26 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_ref.h @@ -0,0 +1,58 @@ +/* +// Copyright (c) 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#pragma once + +#include "common_kernel_base.h" +#include "kernel_selector_params.h" + +namespace kernel_selector { +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// embedding_bag_params +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +struct embedding_bag_params : public base_params { + embedding_bag_params() : base_params(KernelType::EMBEDDING_BAG), type(EmbeddingBagType::PACKED_SUM), default_index(-1) {} + + EmbeddingBagType type; + int32_t default_index; + + virtual ParamsKey GetParamsKey() const { return base_params::GetParamsKey(); } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// embedding_bag_optional_params +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +struct embedding_bag_optional_params : optional_params { + embedding_bag_optional_params() : optional_params(KernelType::EMBEDDING_BAG) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// EmbeddingBagKernelRef +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +class EmbeddingBagKernelRef : public common_kernel_base { +public: + EmbeddingBagKernelRef() : common_kernel_base("embedding_bag_ref") {} + virtual ~EmbeddingBagKernelRef() = default; + +protected: + KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + ParamsKey GetSupportedKey() const override; + + virtual JitConstants GetJitConstants(const embedding_bag_params& params) const; + virtual CommonDispatchData SetDefault(const embedding_bag_params& params) const; + virtual bool Validate(const Params&, const optional_params&) const override; +}; +} 
// namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_selector.cpp new file mode 100644 index 00000000000..14f8253f819 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_selector.cpp @@ -0,0 +1,29 @@ +/* +// Copyright (c) 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "embedding_bag_kernel_selector.h" +#include "embedding_bag_kernel_ref.h" + +namespace kernel_selector { + +embedding_bag_kernel_selector::embedding_bag_kernel_selector() { + Attach(); +} + +KernelsData embedding_bag_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const { + return GetNaiveBestKernel(params, options, KernelType::EMBEDDING_BAG); +} +} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_selector.h new file mode 100644 index 00000000000..a5172e78535 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_selector.h @@ -0,0 +1,35 @@ +/* +// Copyright (c) 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#pragma once + +#include "kernel_selector.h" + +namespace kernel_selector { +class embedding_bag_kernel_selector : public kernel_selector_base { +public: + static embedding_bag_kernel_selector& Instance() { + static embedding_bag_kernel_selector instance_; + return instance_; + } + + embedding_bag_kernel_selector(); + + virtual ~embedding_bag_kernel_selector() {} + + KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; +}; +} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/embedding_bag_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/embedding_bag_ref.cl new file mode 100644 index 00000000000..4410bf11349 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/embedding_bag_ref.cl @@ -0,0 +1,148 @@ +// Copyright (c) 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +#include "include/include_all.cl" + +#ifdef PACKED_SUM +KERNEL(embedding_bag_ref)( + const __global INPUT0_TYPE* emb_table, + const __global INPUT1_TYPE* indices, +#ifdef INPUT2_TYPE + const __global INPUT2_TYPE* weights, +#endif + __global OUTPUT_TYPE* output) +{ + const uint batch = get_global_id(0); + const uint emb_dim1 = get_global_id(1); + const uint emb_dim2 = (uint)get_global_id(2) / OUTPUT_SIZE_X; + const uint emb_dim3 = (uint)get_global_id(2) % OUTPUT_SIZE_X; + + OUTPUT_TYPE res = OUTPUT_VAL_ZERO; + for (int i = 0; i < INPUT1_FEATURE_NUM; ++i) + { + uint indices_index = INPUT1_GET_INDEX(batch, i, 0, 0); + uint emb_index = INPUT0_GET_INDEX(indices[indices_index], emb_dim1, emb_dim2, emb_dim3); + OUTPUT_TYPE val = emb_table[emb_index]; +#ifdef INPUT2_TYPE + { + uint weight_index = INPUT2_GET_INDEX(batch, i, 0, 0); + val *= weights[weight_index]; + } +#endif + res += val; + } + uint out_ind = OUTPUT_GET_INDEX(batch, emb_dim1, emb_dim2, emb_dim3); + output[out_ind] = ACTIVATION(TO_OUTPUT_TYPE(res), ACTIVATION_PARAMS); +} +#endif // PACKED_SUM + +#ifdef OFFSETS_SUM +KERNEL(embedding_bag_ref)( + const __global INPUT0_TYPE* emb_table, + const __global INPUT1_TYPE* indices, + const __global INPUT2_TYPE* offsets, +#ifdef INPUT3_TYPE + const __global INPUT3_TYPE* weights, +#endif + __global OUTPUT_TYPE* output) +{ + const uint batch = get_global_id(0); + const uint emb_dim1 = get_global_id(1); + const uint emb_dim2 = (uint)get_global_id(2) / OUTPUT_SIZE_X; + const uint emb_dim3 = (uint)get_global_id(2) % OUTPUT_SIZE_X; + + uint offsets_ind = INPUT2_OFFSET + batch; + uint start_indices = INPUT1_OFFSET + offsets[offsets_ind]; + offsets_ind = INPUT2_OFFSET + batch + 1; + uint end_indices = (batch < OUTPUT_BATCH_NUM - 1) ? 
+ INPUT1_OFFSET + offsets[offsets_ind] : + INPUT1_LENGTH; + + OUTPUT_TYPE res = OUTPUT_VAL_ZERO; + for (int i = start_indices; i < end_indices; ++i) + { + uint indices_index = INPUT1_OFFSET + i; + uint emb_index = INPUT0_GET_INDEX(indices[indices_index], emb_dim1, emb_dim2, emb_dim3); + OUTPUT_TYPE val = emb_table[emb_index]; +#ifdef INPUT3_TYPE + { + uint weight_index = INPUT3_OFFSET + i; + val *= weights[weight_index]; + } +#endif + res += val; + } + +#ifdef DEFAULT_INDEX + if (start_indices == end_indices) { + { + uint emb_index = INPUT0_GET_INDEX(DEFAULT_INDEX, emb_dim1, emb_dim2, emb_dim3); + res = emb_table[emb_index]; + } + } +#endif + + uint out_ind = OUTPUT_GET_INDEX(batch, emb_dim1, emb_dim2, emb_dim3); + output[out_ind] = ACTIVATION(TO_OUTPUT_TYPE(res), ACTIVATION_PARAMS); +} +#endif // OFFSETS_SUM + +#ifdef SEGMENTS_SUM +KERNEL(embedding_bag_ref)( + const __global INPUT0_TYPE* emb_table, + const __global INPUT1_TYPE* indices, + const __global INPUT2_TYPE* segment_ids, +#ifdef INPUT3_TYPE + const __global INPUT3_TYPE* weights, +#endif + __global OUTPUT_TYPE* output) +{ + const uint batch = get_global_id(0); + const uint emb_dim1 = get_global_id(1); + const uint emb_dim2 = (uint)get_global_id(2) / OUTPUT_SIZE_X; + const uint emb_dim3 = (uint)get_global_id(2) % OUTPUT_SIZE_X; + + OUTPUT_TYPE res = OUTPUT_VAL_ZERO; + bool found = false; + for (int i = 0; i < INPUT2_LENGTH; ++i) { + uint id = segment_ids[INPUT2_OFFSET + i]; + if (id > batch) + break; + if (id == batch) { + found = true; + uint index = indices[INPUT1_OFFSET + i]; + uint emb_index = INPUT0_GET_INDEX(index, emb_dim1, emb_dim2, emb_dim3); + OUTPUT_TYPE val = emb_table[emb_index]; +#ifdef INPUT3_TYPE + { + uint weight_index = INPUT3_OFFSET + i; + val *= weights[weight_index]; + } +#endif + res += val; + } + } + +#ifdef DEFAULT_INDEX + if (!found) { + uint emb_index = INPUT0_GET_INDEX(DEFAULT_INDEX, emb_dim1, emb_dim2, emb_dim3); + res = emb_table[emb_index]; + } +#endif + + uint out_ind = 
OUTPUT_GET_INDEX(batch, emb_dim1, emb_dim2, emb_dim3); + output[out_ind] = ACTIVATION(TO_OUTPUT_TYPE(res), ACTIVATION_PARAMS); +} +#endif // SEGMENTS_SUM diff --git a/inference-engine/thirdparty/clDNN/src/embedding_bag.cpp b/inference-engine/thirdparty/clDNN/src/embedding_bag.cpp new file mode 100644 index 00000000000..c5bf30b23f2 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/embedding_bag.cpp @@ -0,0 +1,70 @@ +/* +// Copyright (c) 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "embedding_bag_inst.h" + +#include "primitive_type_base.h" +#include "error_handler.h" +#include "json_object.h" +#include + +namespace cldnn { +primitive_type_id embedding_bag::type_id() { + static primitive_type_base instance; + return &instance; +} + +layout embedding_bag_inst::calc_output_layout(embedding_bag_node const& node) { + auto desc = node.get_primitive(); + + auto input_layout = node.input(0).get_output_layout(); + auto output_format = input_layout.format; + + auto output_shape = desc->output_shape; + + return layout(input_layout.data_type, output_format, output_shape); +} + +std::string embedding_bag_inst::to_string(embedding_bag_node const& node) { + auto desc = node.get_primitive(); + auto node_info = node.desc_to_json(); + auto& input = node.input(); + + std::stringstream primitive_description; + + json_composite embedding_bag_info; + embedding_bag_info.add("input id", input.id()); + switch(desc->type) { + case embedding_bag::packed_sum: + embedding_bag_info.add("embedding bag type", "PackedSum"); + break; + case embedding_bag::offsets_sum: + embedding_bag_info.add("embedding bag type", "OffsetsSum"); + break; + case embedding_bag::segments_sum: + embedding_bag_info.add("embedding bag type", "SegmentsSum"); + break; + } + + node_info->add("embedding_bag info", embedding_bag_info); + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +embedding_bag_inst::typed_primitive_inst(network_impl& network, embedding_bag_node const& node) + : parent(network, node) {} +} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/src/gpu/embedding_bag_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/embedding_bag_gpu.cpp new file mode 100644 index 00000000000..8d96591fc13 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/gpu/embedding_bag_gpu.cpp @@ -0,0 +1,85 @@ +/* +// Copyright (c) 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not 
use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "embedding_bag_inst.h" +#include "primitive_gpu_base.h" +#include "implementation_map.h" +#include "kernel_selector_helper.h" +#include "embedding_bag/embedding_bag_kernel_selector.h" +#include "embedding_bag/embedding_bag_kernel_ref.h" +#include "error_handler.h" +#include "data_inst.h" + +using namespace cldnn; + +namespace cldnn { +namespace gpu { +struct embedding_bag_gpu : typed_primitive_gpu_impl { + using parent = typed_primitive_gpu_impl; + using parent::parent; + +public: + static primitive_impl* create(const embedding_bag_node& arg) { + auto embedding_bag_params = get_default_params(arg); + auto embedding_bag_optional_params = + get_default_optional_params(arg.get_program()); + + switch (arg.get_primitive()->type) { + case embedding_bag::packed_sum: + embedding_bag_params.type = kernel_selector::EmbeddingBagType::PACKED_SUM; + break; + case embedding_bag::offsets_sum: + embedding_bag_params.type = kernel_selector::EmbeddingBagType::OFFSETS_SUM; + break; + case embedding_bag::segments_sum: + embedding_bag_params.type = kernel_selector::EmbeddingBagType::SEGMENTS_SUM; + break; + default: + CLDNN_ERROR_MESSAGE(arg.id(), "Unknown EmbeddingBag type"); + break; + } + + for (size_t i = 1; i < arg.inputs_count(); i++) { + embedding_bag_params.inputs.push_back(convert_data_tensor(arg.input(i).get_output_layout())); + } + + embedding_bag_params.default_index = arg.get_primitive()->default_index; + + auto& kernel_selector = 
kernel_selector::embedding_bag_kernel_selector::Instance(); + auto best_kernels = kernel_selector.GetBestKernels(embedding_bag_params, embedding_bag_optional_params); + + CLDNN_ERROR_BOOL(arg.id(), + "Best_kernel.empty()", + best_kernels.empty(), + "Cannot find a proper kernel with this arguments"); + + auto embedding_bag = new embedding_bag_gpu(arg, best_kernels[0]); + + return embedding_bag; + } +}; + +namespace detail { + +attach_embedding_bag_gpu::attach_embedding_bag_gpu() { + auto val_fw = embedding_bag_gpu::create; + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), val_fw); +} + +} // namespace detail +} // namespace gpu +} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.cpp index 1eac3183a0c..032c6809491 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.cpp @@ -98,6 +98,7 @@ void register_implementations_gpu() { REGISTER_GPU(grn); REGISTER_GPU(ctc_greedy_decoder); REGISTER_GPU(cum_sum); + REGISTER_GPU(embedding_bag); } } // namespace gpu diff --git a/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.hpp b/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.hpp index 34e7db5be54..d79a1b136b3 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.hpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.hpp @@ -175,6 +175,7 @@ REGISTER_GPU(non_max_suppression); REGISTER_GPU(grn); REGISTER_GPU(ctc_greedy_decoder); REGISTER_GPU(cum_sum); +REGISTER_GPU(embedding_bag); #undef REGISTER_GPU diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp index 591efacc8a7..ec5db1e8e68 100644 --- 
a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp @@ -50,6 +50,7 @@ #include "shuffle_channels_inst.h" #include "strided_slice_inst.h" #include "cum_sum_inst.h" +#include "embedding_bag_inst.h" #include #include #include @@ -200,7 +201,8 @@ void prepare_primitive_fusing::fuse_activations(program_impl &p) { !input.is_type() && !input.is_type() && !input.is_type() && !input.is_type() && !input.is_type() && !input.is_type() && !input.is_type() && !input.is_type() && !input.is_type() && - !input.is_type() && !input.is_type())) + !input.is_type() && !input.is_type() && + !input.is_type())) return; if (input.is_type()) { diff --git a/inference-engine/thirdparty/clDNN/src/include/embedding_bag_inst.h b/inference-engine/thirdparty/clDNN/src/include/embedding_bag_inst.h new file mode 100644 index 00000000000..f24dc7d72fc --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/include/embedding_bag_inst.h @@ -0,0 +1,49 @@ +/* +// Copyright (c) 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once +#include "api/embedding_bag.hpp" + +#include "primitive_inst.h" +#include + +namespace cldnn { +template <> +struct typed_program_node : public typed_program_node_base { + using parent = typed_program_node_base; + +public: + using parent::parent; + + program_node& input(size_t index = 0) const { return get_dependency(index); } + size_t inputs_count() const { return get_dependencies().size(); } +}; + +using embedding_bag_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base { + using parent = typed_primitive_inst_base; + +public: + static layout calc_output_layout(embedding_bag_node const& node); + static std::string to_string(embedding_bag_node const& node); + typed_primitive_inst(network_impl& network, embedding_bag_node const& desc); +}; + +using embedding_bag_inst = typed_primitive_inst; +} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/embedding_bag_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/embedding_bag_gpu_test.cpp new file mode 100644 index 00000000000..26166aa37f0 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/embedding_bag_gpu_test.cpp @@ -0,0 +1,1433 @@ +// Copyright (c) 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +/////////////////////////////////////////////////////////////////////////////////////////////////// +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +using namespace cldnn; +using namespace ::tests; + +TEST(embedding_bag_fp16_gpu, packed_sum_basic) { + // emb_table : 5x2 + // indices : 3x2 + // per_sample_weights : 3x2 + // Output : 3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 1, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 2, 1, 1 } }); + auto per_sample_weights = memory::allocate(engine, { data_types::f16, format::bfyx, { 3, 2, 1, 1 } }); + tensor output_shape = {3, 2, 1, 1}; + + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16(-0.6f), + FLOAT16(-0.1f), FLOAT16(-0.4f), + FLOAT16(-1.9f), FLOAT16(-1.8f), + FLOAT16(-1.0f), FLOAT16(1.5f), + FLOAT16(0.8f), FLOAT16(-0.7f) + }); + set_values(indices, { + 0, 2, + 1, 2, + 3, 4 + }); + set_values(per_sample_weights, { + FLOAT16(0.5f), FLOAT16(0.5f), + FLOAT16(0.5f), FLOAT16(0.5f), + FLOAT16(0.5f), FLOAT16(0.5f) + }); + + auto type = embedding_bag::packed_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(data("Input2", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2"}, type, output_shape) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + -1.05f, -1.2f, + -1.f, -1.1f, + -0.1f, 0.4f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], 
float16_to_float32(output_ptr[i]))) << i; + } +} + +TEST(embedding_bag_fp16_gpu, packed_sum_basic_without_weights) { + // emb_table : 5x2 + // indices : 3x2 + // Output : 3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 1, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 2, 1, 1 } }); + tensor output_shape = {3, 2, 1, 1}; + + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16(-0.6f), + FLOAT16(-0.1f), FLOAT16(-0.4f), + FLOAT16(-1.9f), FLOAT16(-1.8f), + FLOAT16(-1.0f), FLOAT16(1.5f), + FLOAT16(0.8f), FLOAT16(-0.7f) + }); + set_values(indices, { + 0, 2, + 1, 2, + 3, 4 + }); + + auto type = embedding_bag::packed_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1"}, type, output_shape) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + -2.1f, -2.4f, + -2.f, -2.2f, + -0.2f, 0.8f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], float16_to_float32(output_ptr[i]))) << i; + } +} + +TEST(embedding_bag_fp16_gpu, packed_sum_dim2) { + // emb_table : 5x2x2 + // indices : 3x2 + // per_sample_weights : 3x2 + // Output : 3x2x2 + // Input values in fp16 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 2, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 2, 1, 1 } }); + auto per_sample_weights = 
memory::allocate(engine, { data_types::f16, format::bfyx, { 3, 2, 1, 1 } }); + tensor output_shape = {3, 2, 2, 1}; + + /* + * [ 5 + * [ 2 + * [ -0.2, 1.3 ], [ 0.5, -0.3 ] + * ], + * [ 2 + * [ 2.3, 1.3 ], [ -0.4, -0.7 ] + * ], + * [ 2 + * [ 3.3, -4.1 ], [ 2.1, 0.8 ] + * ], + * [ 2 + * [ 3.5, -5.7 ], [ -0.1, 0.3 ] + * ], + * [ 2 + * [ 0.3, 1.0 ], [ 2.3, -4.1 ] + * ] + * ] + */ + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16( 1.3f), FLOAT16( 0.5f), FLOAT16(-0.3f), + FLOAT16( 2.3f), FLOAT16( 1.3f), FLOAT16(-0.4f), FLOAT16(-0.7f), + FLOAT16( 3.3f), FLOAT16(-4.1f), FLOAT16( 2.1f), FLOAT16( 0.8f), + FLOAT16( 3.5f), FLOAT16(-5.7f), FLOAT16(-0.1f), FLOAT16( 0.3f), + FLOAT16( 0.3f), FLOAT16( 1.0f), FLOAT16( 2.3f), FLOAT16(-4.1f) + }); + set_values(indices, { + 0, 2, + 1, 2, + 3, 4 + }); + set_values(per_sample_weights, { + FLOAT16(0.5f), FLOAT16(0.5f), + FLOAT16(0.5f), FLOAT16(0.5f), + FLOAT16(0.5f), FLOAT16(0.5f) + }); + + auto type = embedding_bag::packed_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(data("Input2", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2"}, type, output_shape) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + /* + * [ 3 + * [ 2 + * [ 1.55, -1.4 ], [ 1.3, 0.25 ] + * ], + * [ 2 + * [ 2.8, -1.4 ], [ 0.85, 0.05 ] + * ], + * [ 2 + * [ 1.9, -2.35 ], [ 1.1, -1.9 ] + * ], + * ] + */ + std::vector expected_results = { + 1.55f, -1.4f, 1.3f, 0.25f, + 2.8f, -1.4f, 0.85f, 0.05f, + 1.9f, -2.35f, 1.1f, -1.9f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], float16_to_float32(output_ptr[i]), 
static_cast(1e-2))) << i; + } +} + +TEST(embedding_bag_fp16_gpu, packed_sum_dim3) { + // emb_table : 5x2x3x2 + // indices : 3x2 + // per_sample_weights : 3x2 + // Output : 3x2x3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 3, 2 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 2, 1, 1 } }); + auto per_sample_weights = memory::allocate(engine, { data_types::f16, format::bfyx, { 3, 2, 1, 1 } }); + tensor output_shape = {3, 2, 3, 2}; + + /* + * [ 5 + * [ 2 + * [ 3 + * [ -0.2, 1.3 ], [ 0.5, -0.3 ], [ 0.4, -0.4 ] + * ], + * [ 3 + * [ -0.1, 1.0 ], [ 2.1, 0.7 ], [ -0.2, -0.7 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 1.9, -2.4 ], [ 3.4, -0.7 ], [ -0.4, 0.5 ] + * ], + * [ 3 + * [ 2.3, 1.3 ], [ -0.4, -0.7 ], [ 1.8, -0.9 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 1.5, -2.4 ], [ 4.2, 3.2 ], [ -0.6, 0.9 ] + * ], + * [ 3 + * [ 3.3, -4.1 ], [ 2.1, 0.8 ], [ 5.2, -2.5 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 0.8, -1.9 ], [ 0.7, 3.4 ], [ -3.3, 0.1 ] + * ], + * [ 3 + * [ 3.5, -5.7 ], [ -0.1, 0.3 ], [ 0.4, 3.3 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 6.1, 8.3 ], [ 0.4, -4.4 ], [ -5.2, 0.9 ] + * ], + * [ 3 + * [ 0.3, 1.0 ], [ 2.3, -4.1 ], [ 2.0, -5.7 ] + * ], + * ] + * ] + */ + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16( 1.3f), FLOAT16( 0.5f), FLOAT16(-0.3f), FLOAT16( 0.4f), FLOAT16(-0.4f), + FLOAT16(-0.1f), FLOAT16( 1.0f), FLOAT16( 2.1f), FLOAT16( 0.7f), FLOAT16(-0.2f), FLOAT16(-0.7f), + FLOAT16( 1.9f), FLOAT16(-2.4f), FLOAT16( 3.4f), FLOAT16(-0.7f), FLOAT16(-0.4f), FLOAT16( 0.5f), + FLOAT16( 2.3f), FLOAT16( 1.3f), FLOAT16(-0.4f), FLOAT16(-0.7f), FLOAT16( 1.8f), FLOAT16(-0.9f), + FLOAT16( 1.5f), FLOAT16(-2.4f), FLOAT16( 4.2f), FLOAT16( 3.2f), FLOAT16(-0.6f), FLOAT16( 0.9f), + FLOAT16( 3.3f), FLOAT16(-4.1f), FLOAT16( 2.1f), FLOAT16( 0.8f), FLOAT16( 5.2f), FLOAT16(-2.5f), + FLOAT16( 0.8f), FLOAT16(-1.9f), FLOAT16( 0.7f), FLOAT16( 3.4f), 
FLOAT16(-3.3f), FLOAT16( 0.1f), + FLOAT16( 3.5f), FLOAT16(-5.7f), FLOAT16(-0.1f), FLOAT16( 0.3f), FLOAT16( 0.4f), FLOAT16( 3.3f), + FLOAT16( 6.1f), FLOAT16( 8.3f), FLOAT16( 0.4f), FLOAT16(-4.4f), FLOAT16(-5.2f), FLOAT16( 0.9f), + FLOAT16( 0.3f), FLOAT16( 1.0f), FLOAT16( 2.3f), FLOAT16(-4.1f), FLOAT16( 2.0f), FLOAT16(-5.7f) + }); + set_values(indices, { + 0, 2, + 1, 2, + 3, 4 + }); + set_values(per_sample_weights, { + FLOAT16(0.5f), FLOAT16(0.5f), + FLOAT16(0.5f), FLOAT16(0.5f), + FLOAT16(0.5f), FLOAT16(0.5f) + }); + + auto type = embedding_bag::packed_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(data("Input2", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2"}, type, output_shape) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + /* + * [ 3 + * [ 2 + * [ 3 + * [ 0.65, -0.55 ], [ 2.35, 1.45 ], [ -0.1, 0.25 ] + * ], + * [ 3 + * [ 1.6, -1.55 ], [ 2.1, 0.75 ], [ 2.5, -1.6 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 1.7, -2.4 ], [ 3.8, 1.25 ], [ -0.5, 0.7 ] + * ], + * [ 3 + * [ 2.8, -1.4 ], [ 0.85, 0.05 ], [ 3.5, -1.7 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 3.45, 3.2 ], [ 0.55, -0.5 ], [ -4.25, 0.5 ] + * ], + * [ 3 + * [ 1.9, -2.35 ], [ 1.1, -1.9 ], [ 1.2, -1.2 ] + * ], + * ] + * ] + */ + std::vector expected_results = { + 0.65f, -0.55f, 2.35f, 1.45f, -0.1f, 0.25f, + 1.6f, -1.55f, 2.1f, 0.75f, 2.5f, -1.6f, + 1.7f, -2.4f, 3.8f, 1.25f, -0.5f, 0.7f, + 2.8f, -1.4f, 0.85f, 0.05f, 3.5f, -1.7f, + 3.45f, 3.2f, 0.55f, -0.5f, -4.25f, 0.5f, + 1.9f, -2.35f, 1.1f, -1.9f, 1.2f, -1.2f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], 
float16_to_float32(output_ptr[i]), static_cast(1e-2))) << i; + } +} + +TEST(embedding_bag_fp16_gpu, offsets_sum_basic) { + // emb_table : 5x2 + // indices : 4x1 + // offsets : 3x1 + // per_sample_weights : 4x1 + // default_index : 1x1 + // Output : 3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 1, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto offsets = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 1, 1, 1 } }); + auto per_sample_weights = memory::allocate(engine, { data_types::f16, format::bfyx, { 4, 1, 1, 1 } }); + tensor output_shape = {3, 2, 1, 1}; + + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16(-0.6f), + FLOAT16(-0.1f), FLOAT16(-0.4f), + FLOAT16(-1.9f), FLOAT16(-1.8f), + FLOAT16(-1.0f), FLOAT16(1.5f), + FLOAT16(0.8f), FLOAT16(-0.7f) + }); + set_values(indices, { + 0, 2, 3, 4 + }); + set_values(offsets, { + 0, 2, 2 + }); + set_values(per_sample_weights, { + FLOAT16(0.5f), FLOAT16(0.5f), FLOAT16(0.5f), FLOAT16(0.5f) + }); + + auto type = embedding_bag::offsets_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(input_layout("Input2", offsets.get_layout())); + topology.add(data("Input3", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2", "Input3"}, type, output_shape, 0) + ); + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + network.set_input_data("Input2", offsets); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + -1.05f, -1.2f, + -0.2f, -0.6f, + -0.1f, 0.4f + }; + + for (size_t i = 0; i < 
expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], float16_to_float32(output_ptr[i]))) << i; + } +} + +TEST(embedding_bag_fp16_gpu, offsets_sum_basic_first_empty) { + // emb_table : 5x2 + // indices : 4x1 + // offsets : 3x1 + // per_sample_weights : 4x1 + // default_index : 1x1 + // Output : 3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 1, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto offsets = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 1, 1, 1 } }); + auto per_sample_weights = memory::allocate(engine, { data_types::f16, format::bfyx, { 4, 1, 1, 1 } }); + tensor output_shape = {3, 2, 1, 1}; + + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16(-0.6f), + FLOAT16(-0.1f), FLOAT16(-0.4f), + FLOAT16(-1.9f), FLOAT16(-1.8f), + FLOAT16(-1.0f), FLOAT16(1.5f), + FLOAT16(0.8f), FLOAT16(-0.7f) + }); + set_values(indices, { + 0, 2, 3, 4 + }); + set_values(offsets, { + 0, 0, 2 + }); + set_values(per_sample_weights, { + FLOAT16(0.5f), FLOAT16(0.5f), FLOAT16(0.5f), FLOAT16(0.5f) + }); + + auto type = embedding_bag::offsets_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(input_layout("Input2", offsets.get_layout())); + topology.add(data("Input3", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2", "Input3"}, type, output_shape, 2) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + network.set_input_data("Input2", offsets); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + -1.9f, -1.8f, + 
-1.05f, -1.2f, + -0.1f, 0.4f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], float16_to_float32(output_ptr[i]))) << i; + } +} + +TEST(embedding_bag_fp16_gpu, offsets_sum_basic_last_empty) { + // emb_table : 5x2 + // indices : 4x1 + // offsets : 3x1 + // per_sample_weights : 4x1 + // default_index : 1x1 + // Output : 3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 1, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto offsets = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 1, 1, 1 } }); + auto per_sample_weights = memory::allocate(engine, { data_types::f16, format::bfyx, { 4, 1, 1, 1 } }); + tensor output_shape = {3, 2, 1, 1}; + + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16(-0.6f), + FLOAT16(-0.1f), FLOAT16(-0.4f), + FLOAT16(-1.9f), FLOAT16(-1.8f), + FLOAT16(-1.0f), FLOAT16(1.5f), + FLOAT16(0.8f), FLOAT16(-0.7f) + }); + set_values(indices, { + 0, 2, 3, 4 + }); + set_values(offsets, { + 0, 2, 4 + }); + set_values(per_sample_weights, { + FLOAT16(0.5f), FLOAT16(0.5f), FLOAT16(0.5f), FLOAT16(0.5f) + }); + + auto type = embedding_bag::offsets_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(input_layout("Input2", offsets.get_layout())); + topology.add(data("Input3", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2", "Input3"}, type, output_shape, 2) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + network.set_input_data("Input2", offsets); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = 
output.pointer(); + + std::vector expected_results = { + -1.05f, -1.2f, + -0.1f, 0.4f, + -1.9f, -1.8f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], float16_to_float32(output_ptr[i]))) << i; + } +} + +TEST(embedding_bag_fp16_gpu, offsets_sum_without_weights_and_def_index) { + // emb_table : 5x2 + // indices : 4x1 + // offsets : 3x1 + // Output : 3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 1, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto offsets = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 1, 1, 1 } }); + tensor output_shape = {3, 2, 1, 1}; + + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16(-0.6f), + FLOAT16(-0.1f), FLOAT16(-0.4f), + FLOAT16(-1.9f), FLOAT16(-1.8f), + FLOAT16(-1.0f), FLOAT16(1.5f), + FLOAT16(0.8f), FLOAT16(-0.7f) + }); + set_values(indices, { + 0, 2, 3, 4 + }); + set_values(offsets, { + 0, 2, 2 + }); + + auto type = embedding_bag::offsets_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(input_layout("Input2", offsets.get_layout())); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2"}, type, output_shape) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + network.set_input_data("Input2", offsets); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + -2.1f, -2.4f, + 0, 0, + -0.2f, 0.8f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], float16_to_float32(output_ptr[i]))) << i; + } +} + 
+TEST(embedding_bag_fp16_gpu, offsets_sum_dim3) { + // emb_table : 5x2x3x2 + // indices : 4x1 + // offsets : 3x1 + // per_sample_weights : 4x1 + // default_index : 1x1 + // Output : 3x2x3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 3, 2 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto offsets = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 1, 1, 1 } }); + auto per_sample_weights = memory::allocate(engine, { data_types::f16, format::bfyx, { 4, 1, 1, 1 } }); + tensor output_shape = {3, 2, 3, 2}; + + /* + * [ 5 + * [ 2 + * [ 3 + * [ -0.2, 1.3 ], [ 0.5, -0.3 ], [ 0.4, -0.4 ] + * ], + * [ 3 + * [ -0.1, 1.0 ], [ 2.1, 0.7 ], [ -0.2, -0.7 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 1.9, -2.4 ], [ 3.4, -0.7 ], [ -0.4, 0.5 ] + * ], + * [ 3 + * [ 2.3, 1.3 ], [ -0.4, -0.7 ], [ 1.8, -0.9 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 1.5, -2.4 ], [ 4.2, 3.2 ], [ -0.6, 0.9 ] + * ], + * [ 3 + * [ 3.3, -4.1 ], [ 2.1, 0.8 ], [ 5.2, -2.5 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 0.8, -1.9 ], [ 0.7, 3.4 ], [ -3.3, 0.1 ] + * ], + * [ 3 + * [ 3.5, -5.7 ], [ -0.1, 0.3 ], [ 0.4, 3.3 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 6.1, 8.3 ], [ 0.4, -4.4 ], [ -5.2, 0.9 ] + * ], + * [ 3 + * [ 0.3, 1.0 ], [ 2.3, -4.1 ], [ 2.0, -5.7 ] + * ], + * ] + * ] + */ + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16( 1.3f), FLOAT16( 0.5f), FLOAT16(-0.3f), FLOAT16( 0.4f), FLOAT16(-0.4f), + FLOAT16(-0.1f), FLOAT16( 1.0f), FLOAT16( 2.1f), FLOAT16( 0.7f), FLOAT16(-0.2f), FLOAT16(-0.7f), + FLOAT16( 1.9f), FLOAT16(-2.4f), FLOAT16( 3.4f), FLOAT16(-0.7f), FLOAT16(-0.4f), FLOAT16( 0.5f), + FLOAT16( 2.3f), FLOAT16( 1.3f), FLOAT16(-0.4f), FLOAT16(-0.7f), FLOAT16( 1.8f), FLOAT16(-0.9f), + FLOAT16( 1.5f), FLOAT16(-2.4f), FLOAT16( 4.2f), FLOAT16( 3.2f), FLOAT16(-0.6f), FLOAT16( 0.9f), + FLOAT16( 3.3f), FLOAT16(-4.1f), FLOAT16( 2.1f), FLOAT16( 0.8f), 
FLOAT16( 5.2f), FLOAT16(-2.5f), + FLOAT16( 0.8f), FLOAT16(-1.9f), FLOAT16( 0.7f), FLOAT16( 3.4f), FLOAT16(-3.3f), FLOAT16( 0.1f), + FLOAT16( 3.5f), FLOAT16(-5.7f), FLOAT16(-0.1f), FLOAT16( 0.3f), FLOAT16( 0.4f), FLOAT16( 3.3f), + FLOAT16( 6.1f), FLOAT16( 8.3f), FLOAT16( 0.4f), FLOAT16(-4.4f), FLOAT16(-5.2f), FLOAT16( 0.9f), + FLOAT16( 0.3f), FLOAT16( 1.0f), FLOAT16( 2.3f), FLOAT16(-4.1f), FLOAT16( 2.0f), FLOAT16(-5.7f) + }); + set_values(indices, { + 0, 2, 3, 4 + }); + set_values(offsets, { + 0, 2, 2 + }); + set_values(per_sample_weights, { + FLOAT16(0.5f), FLOAT16(0.5f), + FLOAT16(0.5f), FLOAT16(0.5f) + }); + + auto type = embedding_bag::offsets_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(input_layout("Input2", offsets.get_layout())); + topology.add(data("Input3", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2", "Input3"}, type, output_shape, 0) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + network.set_input_data("Input2", offsets); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + /* + * [ 3 + * [ 2 + * [ 3 + * [ 0.65, -0.55 ], [ 2.35, 1.45 ], [ -0.1, 0.25 ] + * ], + * [ 3 + * [ 1.6, -1.55 ], [ 2.1, 0.75 ], [ 2.5, -1.6 ] + * ], + * ], + * [ 2 + * [ 3 + * [ -0.2, 1.3 ], [ 0.5, -0.3 ], [ 0.4, -0.4 ] + * ], + * [ 3 + * [ -0.1, 1.0 ], [ 2.1, 0.7 ], [ -0.2, -0.7 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 3.45, 3.2 ], [ 0.55, -0.5 ], [ -4.25, 0.5 ] + * ], + * [ 3 + * [ 1.9, -2.35 ], [ 1.1, -1.9 ], [ 1.2, -1.2 ] + * ], + * ] + * ] + */ + std::vector expected_results = { + 0.65f, -0.55f, 2.35f, 1.45f, -0.1f, 0.25f, + 1.6f, -1.55f, 2.1f, 0.75f, 2.5f, -1.6f, + -0.2f, 1.3f, 0.5f, -0.3f, 0.4f, -0.4f, + -0.1f, 1.0f, 2.1f, 0.7f, 
-0.2f, -0.7f, + 3.45f, 3.2f, 0.55f, -0.5f, -4.25f, 0.5f, + 1.9f, -2.35f, 1.1f, -1.9f, 1.2f, -1.2f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], float16_to_float32(output_ptr[i]), static_cast(1e-2))) << i; + } +} + +TEST(embedding_bag_fp16_gpu, segments_sum_basic) { + // emb_table : 5x2 + // indices : 4x1 + // segment_ids : 4x1 + // per_sample_weights : 4x1 + // default_index : 1x1 + // Output : 3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 1, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto segment_ids = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto per_sample_weights = memory::allocate(engine, { data_types::f16, format::bfyx, { 4, 1, 1, 1 } }); + tensor output_shape = {3, 2, 1, 1}; + + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16(-0.6f), + FLOAT16(-0.1f), FLOAT16(-0.4f), + FLOAT16(-1.9f), FLOAT16(-1.8f), + FLOAT16(-1.0f), FLOAT16(1.5f), + FLOAT16(0.8f), FLOAT16(-0.7f) + }); + set_values(indices, { + 0, 2, 3, 4 + }); + set_values(segment_ids, { + 0, 0, 2, 2 + }); + set_values(per_sample_weights, { + FLOAT16(0.5f), FLOAT16(0.5f), FLOAT16(0.5f), FLOAT16(0.5f) + }); + + auto type = embedding_bag::segments_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(input_layout("Input2", segment_ids.get_layout())); + topology.add(data("Input3", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2", "Input3"}, type, output_shape, 0) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + network.set_input_data("Input2", segment_ids); + + auto outputs = 
network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + -1.05f, -1.2f, + -0.2f, -0.6f, + -0.1f, 0.4f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], float16_to_float32(output_ptr[i]))) << i; + } +} + +TEST(embedding_bag_fp16_gpu, segments_sum_basic_first_empty) { + // emb_table : 5x2 + // indices : 4x1 + // segment_ids : 4x1 + // per_sample_weights : 4x1 + // default_index : 1x1 + // Output : 3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 1, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto segment_ids = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto per_sample_weights = memory::allocate(engine, { data_types::f16, format::bfyx, { 4, 1, 1, 1 } }); + tensor output_shape = {3, 2, 1, 1}; + + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16(-0.6f), + FLOAT16(-0.1f), FLOAT16(-0.4f), + FLOAT16(-1.9f), FLOAT16(-1.8f), + FLOAT16(-1.0f), FLOAT16(1.5f), + FLOAT16(0.8f), FLOAT16(-0.7f) + }); + set_values(indices, { + 0, 2, 3, 4 + }); + set_values(segment_ids, { + 1, 1, 2, 2 + }); + set_values(per_sample_weights, { + FLOAT16(0.5f), FLOAT16(0.5f), FLOAT16(0.5f), FLOAT16(0.5f) + }); + + auto type = embedding_bag::segments_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(input_layout("Input2", segment_ids.get_layout())); + topology.add(data("Input3", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2", "Input3"}, type, output_shape, 2) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + 
network.set_input_data("Input1", indices); + network.set_input_data("Input2", segment_ids); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + -1.9f, -1.8f, + -1.05f, -1.2f, + -0.1f, 0.4f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], float16_to_float32(output_ptr[i]))) << i; + } +} + +TEST(embedding_bag_fp16_gpu, segments_sum_basic_last_empty) { + // emb_table : 5x2 + // indices : 4x1 + // segment_ids : 4x1 + // per_sample_weights : 4x1 + // default_index : 1x1 + // Output : 3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 1, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto segment_ids = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto per_sample_weights = memory::allocate(engine, { data_types::f16, format::bfyx, { 4, 1, 1, 1 } }); + tensor output_shape = {3, 2, 1, 1}; + + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16(-0.6f), + FLOAT16(-0.1f), FLOAT16(-0.4f), + FLOAT16(-1.9f), FLOAT16(-1.8f), + FLOAT16(-1.0f), FLOAT16(1.5f), + FLOAT16(0.8f), FLOAT16(-0.7f) + }); + set_values(indices, { + 0, 2, 3, 4 + }); + set_values(segment_ids, { + 0, 0, 1, 1 + }); + set_values(per_sample_weights, { + FLOAT16(0.5f), FLOAT16(0.5f), FLOAT16(0.5f), FLOAT16(0.5f) + }); + + auto type = embedding_bag::segments_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(input_layout("Input2", segment_ids.get_layout())); + topology.add(data("Input3", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2", "Input3"}, type, output_shape, 2) + ); + + 
network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + network.set_input_data("Input2", segment_ids); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + -1.05f, -1.2f, + -0.1f, 0.4f, + -1.9f, -1.8f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], float16_to_float32(output_ptr[i]))) << i; + } +} + +TEST(embedding_bag_fp16_gpu, segments_sum_without_weights_and_def_index) { + // emb_table : 5x2 + // indices : 4x1 + // segment_ids : 4x1 + // Output : 3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 1, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto segment_ids = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + tensor output_shape = {3, 2, 1, 1}; + + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16(-0.6f), + FLOAT16(-0.1f), FLOAT16(-0.4f), + FLOAT16(-1.9f), FLOAT16(-1.8f), + FLOAT16(-1.0f), FLOAT16(1.5f), + FLOAT16(0.8f), FLOAT16(-0.7f) + }); + set_values(indices, { + 0, 2, 3, 4 + }); + set_values(segment_ids, { + 0, 0, 2, 2 + }); + + auto type = embedding_bag::segments_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(input_layout("Input2", segment_ids.get_layout())); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2"}, type, output_shape) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + network.set_input_data("Input2", segment_ids); + + auto outputs = network.execute(); + + auto 
output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + -2.1f, -2.4f, + 0, 0, + -0.2f, 0.8f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], float16_to_float32(output_ptr[i]))) << i; + } +} + +TEST(embedding_bag_fp16_gpu, segments_sum_dim3) { + // emb_table : 5x2x3x2 + // indices : 4x1 + // segment_ids : 4x1 + // per_sample_weights : 4x1 + // default_index : 1x1 + // Output : 3x2x3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 3, 2 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto segment_ids = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto per_sample_weights = memory::allocate(engine, { data_types::f16, format::bfyx, { 4, 1, 1, 1 } }); + tensor output_shape = {3, 2, 3, 2}; + + /* + * [ 5 + * [ 2 + * [ 3 + * [ -0.2, 1.3 ], [ 0.5, -0.3 ], [ 0.4, -0.4 ] + * ], + * [ 3 + * [ -0.1, 1.0 ], [ 2.1, 0.7 ], [ -0.2, -0.7 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 1.9, -2.4 ], [ 3.4, -0.7 ], [ -0.4, 0.5 ] + * ], + * [ 3 + * [ 2.3, 1.3 ], [ -0.4, -0.7 ], [ 1.8, -0.9 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 1.5, -2.4 ], [ 4.2, 3.2 ], [ -0.6, 0.9 ] + * ], + * [ 3 + * [ 3.3, -4.1 ], [ 2.1, 0.8 ], [ 5.2, -2.5 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 0.8, -1.9 ], [ 0.7, 3.4 ], [ -3.3, 0.1 ] + * ], + * [ 3 + * [ 3.5, -5.7 ], [ -0.1, 0.3 ], [ 0.4, 3.3 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 6.1, 8.3 ], [ 0.4, -4.4 ], [ -5.2, 0.9 ] + * ], + * [ 3 + * [ 0.3, 1.0 ], [ 2.3, -4.1 ], [ 2.0, -5.7 ] + * ], + * ] + * ] + */ + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16( 1.3f), FLOAT16( 0.5f), FLOAT16(-0.3f), FLOAT16( 0.4f), FLOAT16(-0.4f), + FLOAT16(-0.1f), FLOAT16( 1.0f), FLOAT16( 2.1f), FLOAT16( 0.7f), FLOAT16(-0.2f), FLOAT16(-0.7f), + FLOAT16( 1.9f), FLOAT16(-2.4f), 
FLOAT16( 3.4f), FLOAT16(-0.7f), FLOAT16(-0.4f), FLOAT16( 0.5f), + FLOAT16( 2.3f), FLOAT16( 1.3f), FLOAT16(-0.4f), FLOAT16(-0.7f), FLOAT16( 1.8f), FLOAT16(-0.9f), + FLOAT16( 1.5f), FLOAT16(-2.4f), FLOAT16( 4.2f), FLOAT16( 3.2f), FLOAT16(-0.6f), FLOAT16( 0.9f), + FLOAT16( 3.3f), FLOAT16(-4.1f), FLOAT16( 2.1f), FLOAT16( 0.8f), FLOAT16( 5.2f), FLOAT16(-2.5f), + FLOAT16( 0.8f), FLOAT16(-1.9f), FLOAT16( 0.7f), FLOAT16( 3.4f), FLOAT16(-3.3f), FLOAT16( 0.1f), + FLOAT16( 3.5f), FLOAT16(-5.7f), FLOAT16(-0.1f), FLOAT16( 0.3f), FLOAT16( 0.4f), FLOAT16( 3.3f), + FLOAT16( 6.1f), FLOAT16( 8.3f), FLOAT16( 0.4f), FLOAT16(-4.4f), FLOAT16(-5.2f), FLOAT16( 0.9f), + FLOAT16( 0.3f), FLOAT16( 1.0f), FLOAT16( 2.3f), FLOAT16(-4.1f), FLOAT16( 2.0f), FLOAT16(-5.7f) + }); + set_values(indices, { + 0, 2, 3, 4 + }); + set_values(segment_ids, { + 0, 0, 2, 2 + }); + set_values(per_sample_weights, { + FLOAT16(0.5f), FLOAT16(0.5f), + FLOAT16(0.5f), FLOAT16(0.5f) + }); + + auto type = embedding_bag::segments_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(input_layout("Input2", segment_ids.get_layout())); + topology.add(data("Input3", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2", "Input3"}, type, output_shape, 0) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + network.set_input_data("Input2", segment_ids); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + /* + * [ 3 + * [ 2 + * [ 3 + * [ 0.65, -0.55 ], [ 2.35, 1.45 ], [ -0.1, 0.25 ] + * ], + * [ 3 + * [ 1.6, -1.55 ], [ 2.1, 0.75 ], [ 2.5, -1.6 ] + * ], + * ], + * [ 2 + * [ 3 + * [ -0.2, 1.3 ], [ 0.5, -0.3 ], [ 0.4, -0.4 ] + * ], + * [ 3 + * [ -0.1, 1.0 ], [ 2.1, 0.7 ], [ -0.2, -0.7 ] + * ], + * ], + 
* [ 2 + * [ 3 + * [ 3.45, 3.2 ], [ 0.55, -0.5 ], [ -4.25, 0.5 ] + * ], + * [ 3 + * [ 1.9, -2.35 ], [ 1.1, -1.9 ], [ 1.2, -1.2 ] + * ], + * ] + * ] + */ + std::vector expected_results = { + 0.65f, -0.55f, 2.35f, 1.45f, -0.1f, 0.25f, + 1.6f, -1.55f, 2.1f, 0.75f, 2.5f, -1.6f, + -0.2f, 1.3f, 0.5f, -0.3f, 0.4f, -0.4f, + -0.1f, 1.0f, 2.1f, 0.7f, -0.2f, -0.7f, + 3.45f, 3.2f, 0.55f, -0.5f, -4.25f, 0.5f, + 1.9f, -2.35f, 1.1f, -1.9f, 1.2f, -1.2f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], float16_to_float32(output_ptr[i]), static_cast(1e-2))) << i; + } +} + +TEST(embedding_bag_fp32_gpu, packed_sum_basic) { + // emb_table : 5x2 + // indices : 3x2 + // per_sample_weights : 3x2 + // Output : 3x2 + // Input values in fp32 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f32, format::bfyx, { 5, 2, 1, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 2, 1, 1 } }); + auto per_sample_weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 3, 2, 1, 1 } }); + tensor output_shape = {3, 2, 1, 1}; + + set_values(emb_table, { + -0.2f, -0.6f, + -0.1f, -0.4f, + -1.9f, -1.8f, + -1.0f, 1.5f, + 0.8f, -0.7f + }); + set_values(indices, { + 0, 2, + 1, 2, + 3, 4 + }); + set_values(per_sample_weights, { + 0.5f, 0.5f, + 0.5f, 0.5f, + 0.5f, 0.5f + }); + + auto type = embedding_bag::packed_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(data("Input2", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2"}, type, output_shape) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto 
output_ptr = output.pointer(); + + std::vector expected_results = { + -1.05f, -1.2f, + -1.f, -1.1f, + -0.1f, 0.4f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], output_ptr[i])) << i; + } +} + +TEST(embedding_bag_fp32_gpu, packed_sum_dim3) { + // emb_table : 5x2x3x2 + // indices : 3x2 + // per_sample_weights : 3x2 + // Output : 3x2x3x2 + // Input values in fp32 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f32, format::bfyx, { 5, 2, 3, 2 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 2, 1, 1 } }); + auto per_sample_weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 3, 2, 1, 1 } }); + tensor output_shape = {3, 2, 3, 2}; + + /* + * [ 5 + * [ 2 + * [ 3 + * [ -0.2, 1.3 ], [ 0.5, -0.3 ], [ 0.4, -0.4 ] + * ], + * [ 3 + * [ -0.1, 1.0 ], [ 2.1, 0.7 ], [ -0.2, -0.7 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 1.9, -2.4 ], [ 3.4, -0.7 ], [ -0.4, 0.5 ] + * ], + * [ 3 + * [ 2.3, 1.3 ], [ -0.4, -0.7 ], [ 1.8, -0.9 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 1.5, -2.4 ], [ 4.2, 3.2 ], [ -0.6, 0.9 ] + * ], + * [ 3 + * [ 3.3, -4.1 ], [ 2.1, 0.8 ], [ 5.2, -2.5 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 0.8, -1.9 ], [ 0.7, 3.4 ], [ -3.3, 0.1 ] + * ], + * [ 3 + * [ 3.5, -5.7 ], [ -0.1, 0.3 ], [ 0.4, 3.3 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 6.1, 8.3 ], [ 0.4, -4.4 ], [ -5.2, 0.9 ] + * ], + * [ 3 + * [ 0.3, 1.0 ], [ 2.3, -4.1 ], [ 2.0, -5.7 ] + * ], + * ] + * ] + */ + set_values(emb_table, { + -0.2f, 1.3f, 0.5f, -0.3f, 0.4f, -0.4f, + -0.1f, 1.0f, 2.1f, 0.7f, -0.2f, -0.7f, + 1.9f, -2.4f, 3.4f, -0.7f, -0.4f, 0.5f, + 2.3f, 1.3f, -0.4f, -0.7f, 1.8f, -0.9f, + 1.5f, -2.4f, 4.2f, 3.2f, -0.6f, 0.9f, + 3.3f, -4.1f, 2.1f, 0.8f, 5.2f, -2.5f, + 0.8f, -1.9f, 0.7f, 3.4f, -3.3f, 0.1f, + 3.5f, -5.7f, -0.1f, 0.3f, 0.4f, 3.3f, + 6.1f, 8.3f, 0.4f, -4.4f, -5.2f, 0.9f, + 0.3f, 1.0f, 2.3f, -4.1f, 2.0f, -5.7f + }); + set_values(indices, { + 0, 2, + 1, 
2, + 3, 4 + }); + set_values(per_sample_weights, { + 0.5f, 0.5f, + 0.5f, 0.5f, + 0.5f, 0.5f + }); + + auto type = embedding_bag::packed_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(data("Input2", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2"}, type, output_shape) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + /* + * [ 3 + * [ 2 + * [ 3 + * [ 0.65, -0.55 ], [ 2.35, 1.45 ], [ -0.1, 0.25 ] + * ], + * [ 3 + * [ 1.6, -1.55 ], [ 2.1, 0.75 ], [ 2.5, -1.6 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 1.7, -2.4 ], [ 3.8, 1.25 ], [ -0.5, 0.7 ] + * ], + * [ 3 + * [ 2.8, -1.4 ], [ 0.85, 0.05 ], [ 3.5, -1.7 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 3.45, 3.2 ], [ 0.55, -0.5 ], [ -4.25, 0.5 ] + * ], + * [ 3 + * [ 1.9, -2.35 ], [ 1.1, -1.9 ], [ 1.2, -1.2 ] + * ], + * ] + * ] + */ + std::vector expected_results = { + 0.65f, -0.55f, 2.35f, 1.45f, -0.1f, 0.25f, + 1.6f, -1.55f, 2.1f, 0.75f, 2.5f, -1.6f, + 1.7f, -2.4f, 3.8f, 1.25f, -0.5f, 0.7f, + 2.8f, -1.4f, 0.85f, 0.05f, 3.5f, -1.7f, + 3.45f, 3.2f, 0.55f, -0.5f, -4.25f, 0.5f, + 1.9f, -2.35f, 1.1f, -1.9f, 1.2f, -1.2f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], output_ptr[i])) << i; + } +} + +TEST(embedding_bag_fp32_gpu, extended5_6) { + // emb_table : 5x6 + // indices : 5x1 + // segment_ids : 5x1 + // Output : 5x6 + // Input values in fp32 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f32, format::bfyx, { 5, 6, 1, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 5, 1, 1, 1 } }); + 
auto segment_ids = memory::allocate(engine, { data_types::i32, format::bfyx, { 5, 1, 1, 1 } }); + tensor output_shape = {5, 6, 1, 1}; + + set_values(emb_table, { + 0.f, 1.f, 8.f, 5.f, 5.f, 2.f, + 0.f, 7.f, 7.f, 10.f, 4.f, 5.f, + 9.f, 0.f, 0.f, 5.f, 7.f, 0.f, + 4.f, 0.f, 4.f, 7.f, 6.f, 10.f, + 9.f, 5.f, 1.f, 7.f, 4.f, 7.f + }); + set_values(indices, { 0, 1, 2, 2, 3 }); + set_values(segment_ids, { 0, 0, 2, 2, 4 }); + + auto type = embedding_bag::segments_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(input_layout("Input2", segment_ids.get_layout())); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2"}, type, output_shape) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + network.set_input_data("Input2", segment_ids); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 0, 8, 15, 15, 9, 7, + 0, 0, 0, 0, 0, 0, + 18, 0, 0, 10, 14, 0, + 0, 0, 0, 0, 0, 0, + 4, 0, 4, 7, 6, 10, + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], output_ptr[i])) << i; + } +}