diff --git a/inference-engine/src/cldnn_engine/cldnn_program.cpp b/inference-engine/src/cldnn_engine/cldnn_program.cpp index 86cdddb81ef..d8a15fa4174 100644 --- a/inference-engine/src/cldnn_engine/cldnn_program.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_program.cpp @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -598,6 +599,9 @@ Program::LayerType Program::LayerTypeFromStr(const std::string &str) { { "CTCGreedyDecoder", CTCGreedyDecoder }, { "PriorBoxClustered", PriorBoxClustered }, { "CumSum", CumSum }, + { "EmbeddingBagPackedSum", EmbeddingBagPackedSum }, + { "EmbeddingBagOffsetsSum", EmbeddingBagOffsetsSum }, + { "EmbeddingSegmentsSum", EmbeddingSegmentsSum }, }; auto it = LayerNameToType.find(str); if (it != LayerNameToType.end()) @@ -1280,6 +1284,12 @@ void Program::CreateSingleLayerPrimitive(cldnn::topology& topology, InferenceEng break; case CumSum: CreateCumSumPrimitive(topology, layer); break; + case EmbeddingBagPackedSum: CreateEmbeddingBagPackedSumPrimitive(topology, layer); + break; + case EmbeddingBagOffsetsSum: CreateEmbeddingBagOffsetsSumPrimitive(topology, layer); + break; + case EmbeddingSegmentsSum: CreateEmbeddingSegmentsSumPrimitive(topology, layer); + break; default: THROW_CLDNN_EXCEPTION("Unknown Layer Type: " << layer->type); } } @@ -4487,6 +4497,95 @@ void Program::CreatePriorBoxClusteredPrimitive(cldnn::topology& topology, Infere AddPrimitiveToProfiler(priorBoxLayerName, layer); } +void Program::CreateEmbeddingBagPackedSumPrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr& layer) { + ValidateLayer(layer, {2, 3}); + + auto inputPrimitives = GetPrevLayersPrimitives(layer); + auto embeddingBag = as(layer); + + auto layerName = layer_type_name_ID(layer); + auto embeddingBagPrim = cldnn::embedding_bag( + layerName, + inputPrimitives, + cldnn::embedding_bag::packed_sum, + CldnnTensorFromIEDims(embeddingBag->outData[0]->getTensorDesc().getDims())); + + topology.add(embeddingBagPrim); + 
AddPrimitiveToProfiler(layerName, layer); +} + +void Program::CreateEmbeddingBagOffsetsSumPrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr& layer) { + ValidateLayer(layer, {3, 4, 5}); + + auto inputPrimitives = GetPrevLayersPrimitives(layer); + auto embeddingBag = as(layer); + + int32_t defaultIndex = -1; + if (inputPrimitives.size() > 3) { + auto defaultIndexInput = layer->insData[3].lock(); + auto defaultIndexInputCreator = defaultIndexInput->getCreatorLayer().lock(); + if (defaultIndexInputCreator->blobs.size() == 1) { + auto constantBlob = defaultIndexInputCreator->blobs.begin()->second; + auto defaultIndexPrecision = constantBlob->getTensorDesc().getPrecision(); + if (defaultIndexPrecision == InferenceEngine::Precision::I32) { + auto data = constantBlob->buffer().as(); + defaultIndex = data[0]; + } else { + THROW_IE_EXCEPTION << layer->name << "Incorrect EmbeddingBagOfsetsSum default_index precision"; + } + } + inputPrimitives.erase(inputPrimitives.begin() + 3); // Remove "default_index" + } + + auto layerName = layer_type_name_ID(layer); + auto embeddingBagPrim = cldnn::embedding_bag( + layerName, + inputPrimitives, + cldnn::embedding_bag::offsets_sum, + CldnnTensorFromIEDims(embeddingBag->outData[0]->getTensorDesc().getDims()), + defaultIndex); + + topology.add(embeddingBagPrim); + AddPrimitiveToProfiler(layerName, layer); +} + +void Program::CreateEmbeddingSegmentsSumPrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr& layer) { + ValidateLayer(layer, {4, 5, 6}); + + auto inputPrimitives = GetPrevLayersPrimitives(layer); + auto embeddingBag = as(layer); + + inputPrimitives.erase(inputPrimitives.begin() + 3); // Remove "num_segments" + + int32_t defaultIndex = -1; + if (inputPrimitives.size() > 3) { + auto defaultIndexInput = layer->insData[4].lock(); + auto defaultIndexInputCreator = defaultIndexInput->getCreatorLayer().lock(); + if (defaultIndexInputCreator->blobs.size() == 1) { + auto constantBlob = 
defaultIndexInputCreator->blobs.begin()->second; + auto defaultIndexPrecision = constantBlob->getTensorDesc().getPrecision(); + if (defaultIndexPrecision == InferenceEngine::Precision::I32) { + auto data = constantBlob->buffer().as(); + defaultIndex = data[0]; + } else { + THROW_IE_EXCEPTION << layer->name << "Incorrect EmbeddingBagOfsetsSum default_index precision"; + } + } + inputPrimitives.erase(inputPrimitives.begin() + 3); // Remove "default_index" + } + + auto layerName = layer_type_name_ID(layer); + auto embeddingBagPrim = cldnn::embedding_bag( + layerName, + inputPrimitives, + cldnn::embedding_bag::segments_sum, + CldnnTensorFromIEDims(embeddingBag->outData[0]->getTensorDesc().getDims()), + defaultIndex); + + topology.add(embeddingBagPrim); + AddPrimitiveToProfiler(layerName, layer); +} + bool Program::IsValidSplitConvMerge(const InferenceEngine::SplitLayer *splitLayer) const { if (splitLayer->outData.size() != 2) return false; // split into 2 diff --git a/inference-engine/src/cldnn_engine/cldnn_program.h b/inference-engine/src/cldnn_engine/cldnn_program.h index 18b6150f3cd..588d7dd36a8 100644 --- a/inference-engine/src/cldnn_engine/cldnn_program.h +++ b/inference-engine/src/cldnn_engine/cldnn_program.h @@ -217,6 +217,9 @@ public: CTCGreedyDecoder, PriorBoxClustered, CumSum, + EmbeddingBagPackedSum, + EmbeddingBagOffsetsSum, + EmbeddingSegmentsSum, NO_TYPE }; using GenericBlobMap = std::map; @@ -373,6 +376,9 @@ private: void CreateCTCGreedyDecoderPrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr& layer); void CreatePriorBoxClusteredPrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr& layer); void CreateCumSumPrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr& layer); + void CreateEmbeddingBagPackedSumPrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr& layer); + void CreateEmbeddingBagOffsetsSumPrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr& layer); + void 
CreateEmbeddingSegmentsSumPrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr& layer); }; } // namespace CLDNNPlugin diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/embedding_bag_offsets_sum.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/embedding_bag_offsets_sum.cpp new file mode 100644 index 00000000000..0557fb59fa3 --- /dev/null +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/embedding_bag_offsets_sum.cpp @@ -0,0 +1,59 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "single_layer_tests/embedding_bag_offsets_sum.hpp" + +#include + +#include "common_test_utils/test_constants.hpp" + +using namespace LayerTestsDefinitions; + +namespace { + +const std::vector netPrecisions = { + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::FP16 +}; + +const std::vector indPrecisions = { + InferenceEngine::Precision::I64, + InferenceEngine::Precision::I32 +}; + +const std::vector> emb_table_shape = { + {5, 6}, + {10, 35}, + {5, 4, 16} +}; + +const std::vector> indices = { + {0, 1, 2, 2, 3}, + {4, 4, 3, 1, 0}, + {1, 2, 1, 2, 1, 2, 1, 2, 1, 2} +}; + +const std::vector> offsets = { + {0, 2}, + {0, 0, 2, 2}, + {2, 4} +}; + +const std::vector default_index = {0, 4}; +const std::vector with_weights = {false, true}; +const std::vector with_default_index = {false, true}; + +const auto embBagOffsetSumArgSet = ::testing::Combine( + ::testing::ValuesIn(emb_table_shape), ::testing::ValuesIn(indices), + ::testing::ValuesIn(offsets), ::testing::ValuesIn(default_index), + ::testing::ValuesIn(with_weights), ::testing::ValuesIn(with_default_index)); + +INSTANTIATE_TEST_CASE_P( + smoke, EmbeddingBagOffsetsSumLayerTest, + ::testing::Combine(embBagOffsetSumArgSet, + ::testing::ValuesIn(netPrecisions), + ::testing::ValuesIn(indPrecisions), + 
::testing::Values(CommonTestUtils::DEVICE_GPU)), + EmbeddingBagOffsetsSumLayerTest::getTestCaseName); +} // namespace diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/embedding_bag_packed_sum.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/embedding_bag_packed_sum.cpp new file mode 100644 index 00000000000..c2aa5ef2d58 --- /dev/null +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/embedding_bag_packed_sum.cpp @@ -0,0 +1,49 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "single_layer_tests/embedding_bag_packed_sum.hpp" + +#include + +#include "common_test_utils/test_constants.hpp" + +using namespace LayerTestsDefinitions; + +namespace { + +const std::vector netPrecisions = { + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::FP16, +}; + +const std::vector indPrecisions = { + InferenceEngine::Precision::I64, + InferenceEngine::Precision::I32 +}; + +const std::vector> emb_table_shape = { + {5, 6}, + {10, 35}, + {5, 4, 16} +}; + +const std::vector>> indices = { + {{0, 1}, {2, 2}, {3, 4}}, + {{4, 4, 3}, {1, 0, 2}}, + {{1, 2, 1, 2}, {1, 2, 1, 2}} +}; +const std::vector with_weights = {false, true}; + +const auto embBagPackedSumArgSet = ::testing::Combine( + ::testing::ValuesIn(emb_table_shape), ::testing::ValuesIn(indices), + ::testing::ValuesIn(with_weights)); + +INSTANTIATE_TEST_CASE_P( + smoke, EmbeddingBagPackedSumLayerTest, + ::testing::Combine(embBagPackedSumArgSet, + ::testing::ValuesIn(netPrecisions), + ::testing::ValuesIn(indPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_GPU)), + EmbeddingBagPackedSumLayerTest::getTestCaseName); +} // namespace diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/embedding_segments_sum.cpp 
b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/embedding_segments_sum.cpp new file mode 100644 index 00000000000..14b5f7133b3 --- /dev/null +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/embedding_segments_sum.cpp @@ -0,0 +1,54 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "single_layer_tests/embedding_segments_sum.hpp" +#include "common_test_utils/test_constants.hpp" + +using namespace LayerTestsDefinitions; + +namespace { + +const std::vector netPrecisions = { + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::FP16 +}; + +const std::vector indPrecisions = { + InferenceEngine::Precision::I64, + InferenceEngine::Precision::I32 +}; + +const std::vector> emb_table_shape = { + {5, 6}, + {10, 35}, + {5, 4, 16} +}; +const std::vector> indices = { + {0, 1, 2, 2, 3}, + {4, 4, 3, 1, 2} +}; +const std::vector> segment_ids = { + {0, 1, 2, 3, 4}, + {0, 0, 2, 2, 4} +}; +const std::vector num_segments = {5, 7}; +const std::vector default_index = {0, 4}; +const std::vector with_weights = {false, true}; +const std::vector with_default_index = {false, true}; + +const auto embSegmentsSumArgSet = ::testing::Combine( + ::testing::ValuesIn(emb_table_shape), ::testing::ValuesIn(indices), + ::testing::ValuesIn(segment_ids), ::testing::ValuesIn(num_segments), + ::testing::ValuesIn(default_index), ::testing::ValuesIn(with_weights), + ::testing::ValuesIn(with_default_index)); + +INSTANTIATE_TEST_CASE_P( + smoke, EmbeddingSegmentsSumLayerTest, + ::testing::Combine(embSegmentsSumArgSet, ::testing::ValuesIn(netPrecisions), + ::testing::ValuesIn(indPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_GPU)), + EmbeddingSegmentsSumLayerTest::getTestCaseName); +} // namespace diff --git a/inference-engine/thirdparty/clDNN/api/embedding_bag.hpp b/inference-engine/thirdparty/clDNN/api/embedding_bag.hpp new file mode 100644 index 
00000000000..2dba4f4eeb9 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/api/embedding_bag.hpp @@ -0,0 +1,64 @@ +/* +// Copyright (c) 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once +#include "primitive.hpp" + +namespace cldnn { +/// @addtogroup cpp_api C++ API +/// @{ +/// @addtogroup cpp_topology Network Topology +/// @{ +/// @addtogroup cpp_primitives Primitives +/// @{ + +/// @brief Computes sums of "bags" of embeddings, without instantiating the intermediate embeddings. +/// @details For each index in `indices` this operator gets values from `data` embedding table and sums all values belonging to each bag. +struct embedding_bag : public primitive_base { + CLDNN_DECLARE_PRIMITIVE(embedding_bag) + + /// @brief Select type of embedding_bag operation + enum embedding_bag_type { + packed_sum, + offsets_sum, + segments_sum + }; + + /// @brief Constructs embedding_bag primitive. + /// @param id This primitive id. + /// @param inputs Vector with different inputs. 
+ /// @param output_shape Tensor with shape of output layout + /// @param default_index default index in embedding table to fill empty "bags" + embedding_bag(const primitive_id& id, + const std::vector& inputs, + const embedding_bag_type& type, + const tensor& output_shape, + const int32_t default_index = -1, + const padding& output_padding = padding()) + : primitive_base(id, inputs, output_padding), type(type), output_shape(output_shape), default_index(default_index) {} + + /// @brief Type of EmbeddingBag operation + embedding_bag_type type; + /// @brief Shape of output layout + tensor output_shape; + /// @brief Default index + int32_t default_index; +}; +/// @} +/// @} +/// @} +} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h b/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h index 0dff12d9e2b..ab1ee211ff6 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h @@ -82,7 +82,8 @@ enum class KernelType { SPACE_TO_DEPTH, GRN, CTC_GREEDY_DECODER, - CUM_SUM + CUM_SUM, + EMBEDDING_BAG }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -532,4 +533,13 @@ enum class CumSumAxis { FEATURE, BATCH }; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// EmbeddingBagType +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +enum class EmbeddingBagType { + PACKED_SUM, + OFFSETS_SUM, + SEGMENTS_SUM +}; } // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_ref.cpp new file 
mode 100644 index 00000000000..333ecc2a212 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_ref.cpp @@ -0,0 +1,140 @@ +/* +// Copyright (c) 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "embedding_bag_kernel_ref.h" +#include "kernel_selector_utils.h" +#include +#include + +namespace kernel_selector { +JitConstants EmbeddingBagKernelRef::GetJitConstants(const embedding_bag_params& params) const { + JitConstants jit = MakeBaseParamsJitConstants(params); + + switch (params.type) { + case EmbeddingBagType::PACKED_SUM: + jit.AddConstant(MakeJitConstant("PACKED_SUM", 1)); + break; + case EmbeddingBagType::OFFSETS_SUM: + jit.AddConstant(MakeJitConstant("OFFSETS_SUM", 1)); + break; + case EmbeddingBagType::SEGMENTS_SUM: + jit.AddConstant(MakeJitConstant("SEGMENTS_SUM", 1)); + break; + default: + break; + } + if (params.default_index > -1) + jit.AddConstant(MakeJitConstant("DEFAULT_INDEX", params.default_index)); + + return jit; +} + +CommonDispatchData EmbeddingBagKernelRef::SetDefault(const embedding_bag_params& params) const { + CommonDispatchData runInfo; + + std::vector global = { params.output.Batch().v, + params.output.Feature().v, + params.output.Y().v * params.output.X().v }; + + auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo); + + runInfo.gws0 = global[0]; + runInfo.gws1 = global[1]; + runInfo.gws2 = global[2]; + + 
runInfo.lws0 = local[0]; + runInfo.lws1 = local[1]; + runInfo.lws2 = local[2]; + + return runInfo; +} + +KernelsData EmbeddingBagKernelRef::GetKernelsData(const Params& params, const optional_params& options) const { + KernelData kd = KernelData::Default(params); + embedding_bag_params& newParams = *static_cast(kd.params.get()); + + if (!Validate(params, options)) { + return {}; + } + + auto runInfo = SetDefault(newParams); + auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); + auto cldnn_jit = GetJitConstants(newParams); + std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + + auto& kernel = kd.kernels[0]; + + FillCLKernelData(kernel, + runInfo, + params.engineInfo, + kernelName, + jit, + entry_point, + "", + false, + false, + (uint32_t)newParams.inputs.size()); + + kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE; + + return { kd }; +} + +ParamsKey EmbeddingBagKernelRef::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + k.EnableInputDataType(Datatype::INT32); + k.EnableInputDataType(Datatype::INT64); + k.EnableInputDataType(Datatype::UINT32); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + + k.EnableInputLayout(DataLayout::bfxy); + + k.EnableAllInputLayout(); + k.EnableAllOutputLayout(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableDifferentTypes(); + return k; +} + +bool EmbeddingBagKernelRef::Validate(const Params& p, const optional_params& o) const { + if (p.GetType() != KernelType::EMBEDDING_BAG || + o.GetType() != KernelType::EMBEDDING_BAG) { + return false; + } + const embedding_bag_params& params = static_cast(p); + + auto checkIntType = [](Datatype dt) { + if (dt != Datatype::INT32 && dt != Datatype::UINT32) + return false; + return true; + }; + + if (!checkIntType(params.inputs[1].GetDType())) + return false; + + if (params.type == EmbeddingBagType::OFFSETS_SUM || 
params.type == EmbeddingBagType::SEGMENTS_SUM) { + if (!checkIntType(params.inputs[2].GetDType())) + return false; + } + + return true; +} +} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_ref.h new file mode 100644 index 00000000000..3333d79ec26 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_ref.h @@ -0,0 +1,58 @@ +/* +// Copyright (c) 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#pragma once + +#include "common_kernel_base.h" +#include "kernel_selector_params.h" + +namespace kernel_selector { +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// embedding_bag_params +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +struct embedding_bag_params : public base_params { + embedding_bag_params() : base_params(KernelType::EMBEDDING_BAG), type(EmbeddingBagType::PACKED_SUM), default_index(-1) {} + + EmbeddingBagType type; + int32_t default_index; + + virtual ParamsKey GetParamsKey() const { return base_params::GetParamsKey(); } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// embedding_bag_optional_params +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +struct embedding_bag_optional_params : optional_params { + embedding_bag_optional_params() : optional_params(KernelType::EMBEDDING_BAG) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// EmbeddingBagKernelRef +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +class EmbeddingBagKernelRef : public common_kernel_base { +public: + EmbeddingBagKernelRef() : common_kernel_base("embedding_bag_ref") {} + virtual ~EmbeddingBagKernelRef() = default; + +protected: + KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + ParamsKey GetSupportedKey() const override; + + virtual JitConstants GetJitConstants(const embedding_bag_params& params) const; + virtual CommonDispatchData SetDefault(const embedding_bag_params& params) const; + virtual bool Validate(const Params&, const optional_params&) const override; +}; +} 
// namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_selector.cpp new file mode 100644 index 00000000000..14f8253f819 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_selector.cpp @@ -0,0 +1,29 @@ +/* +// Copyright (c) 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "embedding_bag_kernel_selector.h" +#include "embedding_bag_kernel_ref.h" + +namespace kernel_selector { + +embedding_bag_kernel_selector::embedding_bag_kernel_selector() { + Attach(); +} + +KernelsData embedding_bag_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const { + return GetNaiveBestKernel(params, options, KernelType::EMBEDDING_BAG); +} +} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_selector.h new file mode 100644 index 00000000000..a5172e78535 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_selector.h @@ -0,0 +1,35 @@ +/* +// Copyright (c) 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#pragma once + +#include "kernel_selector.h" + +namespace kernel_selector { +class embedding_bag_kernel_selector : public kernel_selector_base { +public: + static embedding_bag_kernel_selector& Instance() { + static embedding_bag_kernel_selector instance_; + return instance_; + } + + embedding_bag_kernel_selector(); + + virtual ~embedding_bag_kernel_selector() {} + + KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; +}; +} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/embedding_bag_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/embedding_bag_ref.cl new file mode 100644 index 00000000000..4410bf11349 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/embedding_bag_ref.cl @@ -0,0 +1,148 @@ +// Copyright (c) 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +#include "include/include_all.cl" + +#ifdef PACKED_SUM +KERNEL(embedding_bag_ref)( + const __global INPUT0_TYPE* emb_table, + const __global INPUT1_TYPE* indices, +#ifdef INPUT2_TYPE + const __global INPUT2_TYPE* weights, +#endif + __global OUTPUT_TYPE* output) +{ + const uint batch = get_global_id(0); + const uint emb_dim1 = get_global_id(1); + const uint emb_dim2 = (uint)get_global_id(2) / OUTPUT_SIZE_X; + const uint emb_dim3 = (uint)get_global_id(2) % OUTPUT_SIZE_X; + + OUTPUT_TYPE res = OUTPUT_VAL_ZERO; + for (int i = 0; i < INPUT1_FEATURE_NUM; ++i) + { + uint indices_index = INPUT1_GET_INDEX(batch, i, 0, 0); + uint emb_index = INPUT0_GET_INDEX(indices[indices_index], emb_dim1, emb_dim2, emb_dim3); + OUTPUT_TYPE val = emb_table[emb_index]; +#ifdef INPUT2_TYPE + { + uint weight_index = INPUT2_GET_INDEX(batch, i, 0, 0); + val *= weights[weight_index]; + } +#endif + res += val; + } + uint out_ind = OUTPUT_GET_INDEX(batch, emb_dim1, emb_dim2, emb_dim3); + output[out_ind] = ACTIVATION(TO_OUTPUT_TYPE(res), ACTIVATION_PARAMS); +} +#endif // PACKED_SUM + +#ifdef OFFSETS_SUM +KERNEL(embedding_bag_ref)( + const __global INPUT0_TYPE* emb_table, + const __global INPUT1_TYPE* indices, + const __global INPUT2_TYPE* offsets, +#ifdef INPUT3_TYPE + const __global INPUT3_TYPE* weights, +#endif + __global OUTPUT_TYPE* output) +{ + const uint batch = get_global_id(0); + const uint emb_dim1 = get_global_id(1); + const uint emb_dim2 = (uint)get_global_id(2) / OUTPUT_SIZE_X; + const uint emb_dim3 = (uint)get_global_id(2) % OUTPUT_SIZE_X; + + uint offsets_ind = INPUT2_OFFSET + batch; + uint start_indices = INPUT1_OFFSET + offsets[offsets_ind]; + offsets_ind = INPUT2_OFFSET + batch + 1; + uint end_indices = (batch < OUTPUT_BATCH_NUM - 1) ? 
+ INPUT1_OFFSET + offsets[offsets_ind] : + INPUT1_LENGTH; + + OUTPUT_TYPE res = OUTPUT_VAL_ZERO; + for (int i = start_indices; i < end_indices; ++i) + { + uint indices_index = INPUT1_OFFSET + i; + uint emb_index = INPUT0_GET_INDEX(indices[indices_index], emb_dim1, emb_dim2, emb_dim3); + OUTPUT_TYPE val = emb_table[emb_index]; +#ifdef INPUT3_TYPE + { + uint weight_index = INPUT3_OFFSET + i; + val *= weights[weight_index]; + } +#endif + res += val; + } + +#ifdef DEFAULT_INDEX + if (start_indices == end_indices) { + { + uint emb_index = INPUT0_GET_INDEX(DEFAULT_INDEX, emb_dim1, emb_dim2, emb_dim3); + res = emb_table[emb_index]; + } + } +#endif + + uint out_ind = OUTPUT_GET_INDEX(batch, emb_dim1, emb_dim2, emb_dim3); + output[out_ind] = ACTIVATION(TO_OUTPUT_TYPE(res), ACTIVATION_PARAMS); +} +#endif // OFFSETS_SUM + +#ifdef SEGMENTS_SUM +KERNEL(embedding_bag_ref)( + const __global INPUT0_TYPE* emb_table, + const __global INPUT1_TYPE* indices, + const __global INPUT2_TYPE* segment_ids, +#ifdef INPUT3_TYPE + const __global INPUT3_TYPE* weights, +#endif + __global OUTPUT_TYPE* output) +{ + const uint batch = get_global_id(0); + const uint emb_dim1 = get_global_id(1); + const uint emb_dim2 = (uint)get_global_id(2) / OUTPUT_SIZE_X; + const uint emb_dim3 = (uint)get_global_id(2) % OUTPUT_SIZE_X; + + OUTPUT_TYPE res = OUTPUT_VAL_ZERO; + bool found = false; + for (int i = 0; i < INPUT2_LENGTH; ++i) { + uint id = segment_ids[INPUT2_OFFSET + i]; + if (id > batch) + break; + if (id == batch) { + found = true; + uint index = indices[INPUT1_OFFSET + i]; + uint emb_index = INPUT0_GET_INDEX(index, emb_dim1, emb_dim2, emb_dim3); + OUTPUT_TYPE val = emb_table[emb_index]; +#ifdef INPUT3_TYPE + { + uint weight_index = INPUT3_OFFSET + i; + val *= weights[weight_index]; + } +#endif + res += val; + } + } + +#ifdef DEFAULT_INDEX + if (!found) { + uint emb_index = INPUT0_GET_INDEX(DEFAULT_INDEX, emb_dim1, emb_dim2, emb_dim3); + res = emb_table[emb_index]; + } +#endif + + uint out_ind = 
OUTPUT_GET_INDEX(batch, emb_dim1, emb_dim2, emb_dim3); + output[out_ind] = ACTIVATION(TO_OUTPUT_TYPE(res), ACTIVATION_PARAMS); +} +#endif // SEGMENTS_SUM diff --git a/inference-engine/thirdparty/clDNN/src/embedding_bag.cpp b/inference-engine/thirdparty/clDNN/src/embedding_bag.cpp new file mode 100644 index 00000000000..c5bf30b23f2 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/embedding_bag.cpp @@ -0,0 +1,70 @@ +/* +// Copyright (c) 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "embedding_bag_inst.h" + +#include "primitive_type_base.h" +#include "error_handler.h" +#include "json_object.h" +#include + +namespace cldnn { +primitive_type_id embedding_bag::type_id() { + static primitive_type_base instance; + return &instance; +} + +layout embedding_bag_inst::calc_output_layout(embedding_bag_node const& node) { + auto desc = node.get_primitive(); + + auto input_layout = node.input(0).get_output_layout(); + auto output_format = input_layout.format; + + auto output_shape = desc->output_shape; + + return layout(input_layout.data_type, output_format, output_shape); +} + +std::string embedding_bag_inst::to_string(embedding_bag_node const& node) { + auto desc = node.get_primitive(); + auto node_info = node.desc_to_json(); + auto& input = node.input(); + + std::stringstream primitive_description; + + json_composite embedding_bag_info; + embedding_bag_info.add("input id", input.id()); + switch(desc->type) { + case embedding_bag::packed_sum: + embedding_bag_info.add("embedding bag type", "PackedSum"); + break; + case embedding_bag::offsets_sum: + embedding_bag_info.add("embedding bag type", "OffsetsSum"); + break; + case embedding_bag::segments_sum: + embedding_bag_info.add("embedding bag type", "SegmentsSum"); + break; + } + + node_info->add("embedding_bag info", embedding_bag_info); + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +embedding_bag_inst::typed_primitive_inst(network_impl& network, embedding_bag_node const& node) + : parent(network, node) {} +} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/src/gpu/embedding_bag_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/embedding_bag_gpu.cpp new file mode 100644 index 00000000000..8d96591fc13 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/gpu/embedding_bag_gpu.cpp @@ -0,0 +1,85 @@ +/* +// Copyright (c) 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not 
use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "embedding_bag_inst.h" +#include "primitive_gpu_base.h" +#include "implementation_map.h" +#include "kernel_selector_helper.h" +#include "embedding_bag/embedding_bag_kernel_selector.h" +#include "embedding_bag/embedding_bag_kernel_ref.h" +#include "error_handler.h" +#include "data_inst.h" + +using namespace cldnn; + +namespace cldnn { +namespace gpu { +struct embedding_bag_gpu : typed_primitive_gpu_impl { + using parent = typed_primitive_gpu_impl; + using parent::parent; + +public: + static primitive_impl* create(const embedding_bag_node& arg) { + auto embedding_bag_params = get_default_params(arg); + auto embedding_bag_optional_params = + get_default_optional_params(arg.get_program()); + + switch (arg.get_primitive()->type) { + case embedding_bag::packed_sum: + embedding_bag_params.type = kernel_selector::EmbeddingBagType::PACKED_SUM; + break; + case embedding_bag::offsets_sum: + embedding_bag_params.type = kernel_selector::EmbeddingBagType::OFFSETS_SUM; + break; + case embedding_bag::segments_sum: + embedding_bag_params.type = kernel_selector::EmbeddingBagType::SEGMENTS_SUM; + break; + default: + CLDNN_ERROR_MESSAGE(arg.id(), "Unknown EmbeddingBag type"); + break; + } + + for (size_t i = 1; i < arg.inputs_count(); i++) { + embedding_bag_params.inputs.push_back(convert_data_tensor(arg.input(i).get_output_layout())); + } + + embedding_bag_params.default_index = arg.get_primitive()->default_index; + + auto& kernel_selector = 
kernel_selector::embedding_bag_kernel_selector::Instance(); + auto best_kernels = kernel_selector.GetBestKernels(embedding_bag_params, embedding_bag_optional_params); + + CLDNN_ERROR_BOOL(arg.id(), + "Best_kernel.empty()", + best_kernels.empty(), + "Cannot find a proper kernel with this arguments"); + + auto embedding_bag = new embedding_bag_gpu(arg, best_kernels[0]); + + return embedding_bag; + } +}; + +namespace detail { + +attach_embedding_bag_gpu::attach_embedding_bag_gpu() { + auto val_fw = embedding_bag_gpu::create; + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), val_fw); +} + +} // namespace detail +} // namespace gpu +} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.cpp index 1eac3183a0c..032c6809491 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.cpp @@ -98,6 +98,7 @@ void register_implementations_gpu() { REGISTER_GPU(grn); REGISTER_GPU(ctc_greedy_decoder); REGISTER_GPU(cum_sum); + REGISTER_GPU(embedding_bag); } } // namespace gpu diff --git a/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.hpp b/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.hpp index 34e7db5be54..d79a1b136b3 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.hpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.hpp @@ -175,6 +175,7 @@ REGISTER_GPU(non_max_suppression); REGISTER_GPU(grn); REGISTER_GPU(ctc_greedy_decoder); REGISTER_GPU(cum_sum); +REGISTER_GPU(embedding_bag); #undef REGISTER_GPU diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp index 591efacc8a7..ec5db1e8e68 100644 --- 
a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp @@ -50,6 +50,7 @@ #include "shuffle_channels_inst.h" #include "strided_slice_inst.h" #include "cum_sum_inst.h" +#include "embedding_bag_inst.h" #include #include #include @@ -200,7 +201,8 @@ void prepare_primitive_fusing::fuse_activations(program_impl &p) { !input.is_type() && !input.is_type() && !input.is_type() && !input.is_type() && !input.is_type() && !input.is_type() && !input.is_type() && !input.is_type() && !input.is_type() && - !input.is_type() && !input.is_type())) + !input.is_type() && !input.is_type() && + !input.is_type())) return; if (input.is_type()) { diff --git a/inference-engine/thirdparty/clDNN/src/include/embedding_bag_inst.h b/inference-engine/thirdparty/clDNN/src/include/embedding_bag_inst.h new file mode 100644 index 00000000000..f24dc7d72fc --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/include/embedding_bag_inst.h @@ -0,0 +1,49 @@ +/* +// Copyright (c) 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once +#include "api/embedding_bag.hpp" + +#include "primitive_inst.h" +#include + +namespace cldnn { +template <> +struct typed_program_node : public typed_program_node_base { + using parent = typed_program_node_base; + +public: + using parent::parent; + + program_node& input(size_t index = 0) const { return get_dependency(index); } + size_t inputs_count() const { return get_dependencies().size(); } +}; + +using embedding_bag_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base { + using parent = typed_primitive_inst_base; + +public: + static layout calc_output_layout(embedding_bag_node const& node); + static std::string to_string(embedding_bag_node const& node); + typed_primitive_inst(network_impl& network, embedding_bag_node const& desc); +}; + +using embedding_bag_inst = typed_primitive_inst; +} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/embedding_bag_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/embedding_bag_gpu_test.cpp new file mode 100644 index 00000000000..26166aa37f0 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/embedding_bag_gpu_test.cpp @@ -0,0 +1,1433 @@ +// Copyright (c) 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +/////////////////////////////////////////////////////////////////////////////////////////////////// +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +using namespace cldnn; +using namespace ::tests; + +TEST(embedding_bag_fp16_gpu, packed_sum_basic) { + // emb_table : 5x2 + // indices : 3x2 + // per_sample_weights : 3x2 + // Output : 3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 1, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 2, 1, 1 } }); + auto per_sample_weights = memory::allocate(engine, { data_types::f16, format::bfyx, { 3, 2, 1, 1 } }); + tensor output_shape = {3, 2, 1, 1}; + + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16(-0.6f), + FLOAT16(-0.1f), FLOAT16(-0.4f), + FLOAT16(-1.9f), FLOAT16(-1.8f), + FLOAT16(-1.0f), FLOAT16(1.5f), + FLOAT16(0.8f), FLOAT16(-0.7f) + }); + set_values(indices, { + 0, 2, + 1, 2, + 3, 4 + }); + set_values(per_sample_weights, { + FLOAT16(0.5f), FLOAT16(0.5f), + FLOAT16(0.5f), FLOAT16(0.5f), + FLOAT16(0.5f), FLOAT16(0.5f) + }); + + auto type = embedding_bag::packed_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(data("Input2", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2"}, type, output_shape) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + -1.05f, -1.2f, + -1.f, -1.1f, + -0.1f, 0.4f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], 
float16_to_float32(output_ptr[i]))) << i; + } +} + +TEST(embedding_bag_fp16_gpu, packed_sum_basic_without_weights) { + // emb_table : 5x2 + // indices : 3x2 + // Output : 3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 1, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 2, 1, 1 } }); + tensor output_shape = {3, 2, 1, 1}; + + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16(-0.6f), + FLOAT16(-0.1f), FLOAT16(-0.4f), + FLOAT16(-1.9f), FLOAT16(-1.8f), + FLOAT16(-1.0f), FLOAT16(1.5f), + FLOAT16(0.8f), FLOAT16(-0.7f) + }); + set_values(indices, { + 0, 2, + 1, 2, + 3, 4 + }); + + auto type = embedding_bag::packed_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1"}, type, output_shape) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + -2.1f, -2.4f, + -2.f, -2.2f, + -0.2f, 0.8f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], float16_to_float32(output_ptr[i]))) << i; + } +} + +TEST(embedding_bag_fp16_gpu, packed_sum_dim2) { + // emb_table : 5x2x2 + // indices : 3x2 + // per_sample_weights : 3x2 + // Output : 3x2x2 + // Input values in fp16 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 2, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 2, 1, 1 } }); + auto per_sample_weights = 
memory::allocate(engine, { data_types::f16, format::bfyx, { 3, 2, 1, 1 } }); + tensor output_shape = {3, 2, 2, 1}; + + /* + * [ 5 + * [ 2 + * [ -0.2, 1.3 ], [ 0.5, -0.3 ] + * ], + * [ 2 + * [ 2.3, 1.3 ], [ -0.4, -0.7 ] + * ], + * [ 2 + * [ 3.3, -4.1 ], [ 2.1, 0.8 ] + * ], + * [ 2 + * [ 3.5, -5.7 ], [ -0.1, 0.3 ] + * ], + * [ 2 + * [ 0.3, 1.0 ], [ 2.3, -4.1 ] + * ] + * ] + */ + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16( 1.3f), FLOAT16( 0.5f), FLOAT16(-0.3f), + FLOAT16( 2.3f), FLOAT16( 1.3f), FLOAT16(-0.4f), FLOAT16(-0.7f), + FLOAT16( 3.3f), FLOAT16(-4.1f), FLOAT16( 2.1f), FLOAT16( 0.8f), + FLOAT16( 3.5f), FLOAT16(-5.7f), FLOAT16(-0.1f), FLOAT16( 0.3f), + FLOAT16( 0.3f), FLOAT16( 1.0f), FLOAT16( 2.3f), FLOAT16(-4.1f) + }); + set_values(indices, { + 0, 2, + 1, 2, + 3, 4 + }); + set_values(per_sample_weights, { + FLOAT16(0.5f), FLOAT16(0.5f), + FLOAT16(0.5f), FLOAT16(0.5f), + FLOAT16(0.5f), FLOAT16(0.5f) + }); + + auto type = embedding_bag::packed_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(data("Input2", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2"}, type, output_shape) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + /* + * [ 3 + * [ 2 + * [ 1.55, -1.4 ], [ 1.3, 0.25 ] + * ], + * [ 2 + * [ 2.8, -1.4 ], [ 0.85, 0.05 ] + * ], + * [ 2 + * [ 1.9, -2.35 ], [ 1.1, -1.9 ] + * ], + * ] + */ + std::vector expected_results = { + 1.55f, -1.4f, 1.3f, 0.25f, + 2.8f, -1.4f, 0.85f, 0.05f, + 1.9f, -2.35f, 1.1f, -1.9f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], float16_to_float32(output_ptr[i]), 
static_cast(1e-2))) << i; + } +} + +TEST(embedding_bag_fp16_gpu, packed_sum_dim3) { + // emb_table : 5x2x3x2 + // indices : 3x2 + // per_sample_weights : 3x2 + // Output : 3x2x3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 3, 2 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 2, 1, 1 } }); + auto per_sample_weights = memory::allocate(engine, { data_types::f16, format::bfyx, { 3, 2, 1, 1 } }); + tensor output_shape = {3, 2, 3, 2}; + + /* + * [ 5 + * [ 2 + * [ 3 + * [ -0.2, 1.3 ], [ 0.5, -0.3 ], [ 0.4, -0.4 ] + * ], + * [ 3 + * [ -0.1, 1.0 ], [ 2.1, 0.7 ], [ -0.2, -0.7 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 1.9, -2.4 ], [ 3.4, -0.7 ], [ -0.4, 0.5 ] + * ], + * [ 3 + * [ 2.3, 1.3 ], [ -0.4, -0.7 ], [ 1.8, -0.9 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 1.5, -2.4 ], [ 4.2, 3.2 ], [ -0.6, 0.9 ] + * ], + * [ 3 + * [ 3.3, -4.1 ], [ 2.1, 0.8 ], [ 5.2, -2.5 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 0.8, -1.9 ], [ 0.7, 3.4 ], [ -3.3, 0.1 ] + * ], + * [ 3 + * [ 3.5, -5.7 ], [ -0.1, 0.3 ], [ 0.4, 3.3 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 6.1, 8.3 ], [ 0.4, -4.4 ], [ -5.2, 0.9 ] + * ], + * [ 3 + * [ 0.3, 1.0 ], [ 2.3, -4.1 ], [ 2.0, -5.7 ] + * ], + * ] + * ] + */ + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16( 1.3f), FLOAT16( 0.5f), FLOAT16(-0.3f), FLOAT16( 0.4f), FLOAT16(-0.4f), + FLOAT16(-0.1f), FLOAT16( 1.0f), FLOAT16( 2.1f), FLOAT16( 0.7f), FLOAT16(-0.2f), FLOAT16(-0.7f), + FLOAT16( 1.9f), FLOAT16(-2.4f), FLOAT16( 3.4f), FLOAT16(-0.7f), FLOAT16(-0.4f), FLOAT16( 0.5f), + FLOAT16( 2.3f), FLOAT16( 1.3f), FLOAT16(-0.4f), FLOAT16(-0.7f), FLOAT16( 1.8f), FLOAT16(-0.9f), + FLOAT16( 1.5f), FLOAT16(-2.4f), FLOAT16( 4.2f), FLOAT16( 3.2f), FLOAT16(-0.6f), FLOAT16( 0.9f), + FLOAT16( 3.3f), FLOAT16(-4.1f), FLOAT16( 2.1f), FLOAT16( 0.8f), FLOAT16( 5.2f), FLOAT16(-2.5f), + FLOAT16( 0.8f), FLOAT16(-1.9f), FLOAT16( 0.7f), FLOAT16( 3.4f), 
FLOAT16(-3.3f), FLOAT16( 0.1f), + FLOAT16( 3.5f), FLOAT16(-5.7f), FLOAT16(-0.1f), FLOAT16( 0.3f), FLOAT16( 0.4f), FLOAT16( 3.3f), + FLOAT16( 6.1f), FLOAT16( 8.3f), FLOAT16( 0.4f), FLOAT16(-4.4f), FLOAT16(-5.2f), FLOAT16( 0.9f), + FLOAT16( 0.3f), FLOAT16( 1.0f), FLOAT16( 2.3f), FLOAT16(-4.1f), FLOAT16( 2.0f), FLOAT16(-5.7f) + }); + set_values(indices, { + 0, 2, + 1, 2, + 3, 4 + }); + set_values(per_sample_weights, { + FLOAT16(0.5f), FLOAT16(0.5f), + FLOAT16(0.5f), FLOAT16(0.5f), + FLOAT16(0.5f), FLOAT16(0.5f) + }); + + auto type = embedding_bag::packed_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(data("Input2", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2"}, type, output_shape) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + /* + * [ 3 + * [ 2 + * [ 3 + * [ 0.65, -0.55 ], [ 2.35, 1.45 ], [ -0.1, 0.25 ] + * ], + * [ 3 + * [ 1.6, -1.55 ], [ 2.1, 0.75 ], [ 2.5, -1.6 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 1.7, -2.4 ], [ 3.8, 1.25 ], [ -0.5, 0.7 ] + * ], + * [ 3 + * [ 2.8, -1.4 ], [ 0.85, 0.05 ], [ 3.5, -1.7 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 3.45, 3.2 ], [ 0.55, -0.5 ], [ -4.25, 0.5 ] + * ], + * [ 3 + * [ 1.9, -2.35 ], [ 1.1, -1.9 ], [ 1.2, -1.2 ] + * ], + * ] + * ] + */ + std::vector expected_results = { + 0.65f, -0.55f, 2.35f, 1.45f, -0.1f, 0.25f, + 1.6f, -1.55f, 2.1f, 0.75f, 2.5f, -1.6f, + 1.7f, -2.4f, 3.8f, 1.25f, -0.5f, 0.7f, + 2.8f, -1.4f, 0.85f, 0.05f, 3.5f, -1.7f, + 3.45f, 3.2f, 0.55f, -0.5f, -4.25f, 0.5f, + 1.9f, -2.35f, 1.1f, -1.9f, 1.2f, -1.2f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], 
float16_to_float32(output_ptr[i]), static_cast(1e-2))) << i; + } +} + +TEST(embedding_bag_fp16_gpu, offsets_sum_basic) { + // emb_table : 5x2 + // indices : 4x1 + // offsets : 3x1 + // per_sample_weights : 4x1 + // default_index : 1x1 + // Output : 3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 1, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto offsets = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 1, 1, 1 } }); + auto per_sample_weights = memory::allocate(engine, { data_types::f16, format::bfyx, { 4, 1, 1, 1 } }); + tensor output_shape = {3, 2, 1, 1}; + + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16(-0.6f), + FLOAT16(-0.1f), FLOAT16(-0.4f), + FLOAT16(-1.9f), FLOAT16(-1.8f), + FLOAT16(-1.0f), FLOAT16(1.5f), + FLOAT16(0.8f), FLOAT16(-0.7f) + }); + set_values(indices, { + 0, 2, 3, 4 + }); + set_values(offsets, { + 0, 2, 2 + }); + set_values(per_sample_weights, { + FLOAT16(0.5f), FLOAT16(0.5f), FLOAT16(0.5f), FLOAT16(0.5f) + }); + + auto type = embedding_bag::offsets_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(input_layout("Input2", offsets.get_layout())); + topology.add(data("Input3", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2", "Input3"}, type, output_shape, 0) + ); + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + network.set_input_data("Input2", offsets); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + -1.05f, -1.2f, + -0.2f, -0.6f, + -0.1f, 0.4f + }; + + for (size_t i = 0; i < 
expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], float16_to_float32(output_ptr[i]))) << i; + } +} + +TEST(embedding_bag_fp16_gpu, offsets_sum_basic_first_empty) { + // emb_table : 5x2 + // indices : 4x1 + // offsets : 3x1 + // per_sample_weights : 4x1 + // default_index : 1x1 + // Output : 3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 1, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto offsets = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 1, 1, 1 } }); + auto per_sample_weights = memory::allocate(engine, { data_types::f16, format::bfyx, { 4, 1, 1, 1 } }); + tensor output_shape = {3, 2, 1, 1}; + + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16(-0.6f), + FLOAT16(-0.1f), FLOAT16(-0.4f), + FLOAT16(-1.9f), FLOAT16(-1.8f), + FLOAT16(-1.0f), FLOAT16(1.5f), + FLOAT16(0.8f), FLOAT16(-0.7f) + }); + set_values(indices, { + 0, 2, 3, 4 + }); + set_values(offsets, { + 0, 0, 2 + }); + set_values(per_sample_weights, { + FLOAT16(0.5f), FLOAT16(0.5f), FLOAT16(0.5f), FLOAT16(0.5f) + }); + + auto type = embedding_bag::offsets_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(input_layout("Input2", offsets.get_layout())); + topology.add(data("Input3", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2", "Input3"}, type, output_shape, 2) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + network.set_input_data("Input2", offsets); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + -1.9f, -1.8f, + 
-1.05f, -1.2f, + -0.1f, 0.4f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], float16_to_float32(output_ptr[i]))) << i; + } +} + +TEST(embedding_bag_fp16_gpu, offsets_sum_basic_last_empty) { + // emb_table : 5x2 + // indices : 4x1 + // offsets : 3x1 + // per_sample_weights : 4x1 + // default_index : 1x1 + // Output : 3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 1, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto offsets = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 1, 1, 1 } }); + auto per_sample_weights = memory::allocate(engine, { data_types::f16, format::bfyx, { 4, 1, 1, 1 } }); + tensor output_shape = {3, 2, 1, 1}; + + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16(-0.6f), + FLOAT16(-0.1f), FLOAT16(-0.4f), + FLOAT16(-1.9f), FLOAT16(-1.8f), + FLOAT16(-1.0f), FLOAT16(1.5f), + FLOAT16(0.8f), FLOAT16(-0.7f) + }); + set_values(indices, { + 0, 2, 3, 4 + }); + set_values(offsets, { + 0, 2, 4 + }); + set_values(per_sample_weights, { + FLOAT16(0.5f), FLOAT16(0.5f), FLOAT16(0.5f), FLOAT16(0.5f) + }); + + auto type = embedding_bag::offsets_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(input_layout("Input2", offsets.get_layout())); + topology.add(data("Input3", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2", "Input3"}, type, output_shape, 2) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + network.set_input_data("Input2", offsets); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = 
output.pointer(); + + std::vector expected_results = { + -1.05f, -1.2f, + -0.1f, 0.4f, + -1.9f, -1.8f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], float16_to_float32(output_ptr[i]))) << i; + } +} + +TEST(embedding_bag_fp16_gpu, offsets_sum_without_weights_and_def_index) { + // emb_table : 5x2 + // indices : 4x1 + // offsets : 3x1 + // Output : 3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 1, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto offsets = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 1, 1, 1 } }); + tensor output_shape = {3, 2, 1, 1}; + + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16(-0.6f), + FLOAT16(-0.1f), FLOAT16(-0.4f), + FLOAT16(-1.9f), FLOAT16(-1.8f), + FLOAT16(-1.0f), FLOAT16(1.5f), + FLOAT16(0.8f), FLOAT16(-0.7f) + }); + set_values(indices, { + 0, 2, 3, 4 + }); + set_values(offsets, { + 0, 2, 2 + }); + + auto type = embedding_bag::offsets_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(input_layout("Input2", offsets.get_layout())); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2"}, type, output_shape) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + network.set_input_data("Input2", offsets); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + -2.1f, -2.4f, + 0, 0, + -0.2f, 0.8f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], float16_to_float32(output_ptr[i]))) << i; + } +} + 
+TEST(embedding_bag_fp16_gpu, offsets_sum_dim3) { + // emb_table : 5x2x3x2 + // indices : 4x1 + // offsets : 3x1 + // per_sample_weights : 4x1 + // default_index : 1x1 + // Output : 3x2x3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 3, 2 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto offsets = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 1, 1, 1 } }); + auto per_sample_weights = memory::allocate(engine, { data_types::f16, format::bfyx, { 4, 1, 1, 1 } }); + tensor output_shape = {3, 2, 3, 2}; + + /* + * [ 5 + * [ 2 + * [ 3 + * [ -0.2, 1.3 ], [ 0.5, -0.3 ], [ 0.4, -0.4 ] + * ], + * [ 3 + * [ -0.1, 1.0 ], [ 2.1, 0.7 ], [ -0.2, -0.7 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 1.9, -2.4 ], [ 3.4, -0.7 ], [ -0.4, 0.5 ] + * ], + * [ 3 + * [ 2.3, 1.3 ], [ -0.4, -0.7 ], [ 1.8, -0.9 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 1.5, -2.4 ], [ 4.2, 3.2 ], [ -0.6, 0.9 ] + * ], + * [ 3 + * [ 3.3, -4.1 ], [ 2.1, 0.8 ], [ 5.2, -2.5 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 0.8, -1.9 ], [ 0.7, 3.4 ], [ -3.3, 0.1 ] + * ], + * [ 3 + * [ 3.5, -5.7 ], [ -0.1, 0.3 ], [ 0.4, 3.3 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 6.1, 8.3 ], [ 0.4, -4.4 ], [ -5.2, 0.9 ] + * ], + * [ 3 + * [ 0.3, 1.0 ], [ 2.3, -4.1 ], [ 2.0, -5.7 ] + * ], + * ] + * ] + */ + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16( 1.3f), FLOAT16( 0.5f), FLOAT16(-0.3f), FLOAT16( 0.4f), FLOAT16(-0.4f), + FLOAT16(-0.1f), FLOAT16( 1.0f), FLOAT16( 2.1f), FLOAT16( 0.7f), FLOAT16(-0.2f), FLOAT16(-0.7f), + FLOAT16( 1.9f), FLOAT16(-2.4f), FLOAT16( 3.4f), FLOAT16(-0.7f), FLOAT16(-0.4f), FLOAT16( 0.5f), + FLOAT16( 2.3f), FLOAT16( 1.3f), FLOAT16(-0.4f), FLOAT16(-0.7f), FLOAT16( 1.8f), FLOAT16(-0.9f), + FLOAT16( 1.5f), FLOAT16(-2.4f), FLOAT16( 4.2f), FLOAT16( 3.2f), FLOAT16(-0.6f), FLOAT16( 0.9f), + FLOAT16( 3.3f), FLOAT16(-4.1f), FLOAT16( 2.1f), FLOAT16( 0.8f), 
FLOAT16( 5.2f), FLOAT16(-2.5f), + FLOAT16( 0.8f), FLOAT16(-1.9f), FLOAT16( 0.7f), FLOAT16( 3.4f), FLOAT16(-3.3f), FLOAT16( 0.1f), + FLOAT16( 3.5f), FLOAT16(-5.7f), FLOAT16(-0.1f), FLOAT16( 0.3f), FLOAT16( 0.4f), FLOAT16( 3.3f), + FLOAT16( 6.1f), FLOAT16( 8.3f), FLOAT16( 0.4f), FLOAT16(-4.4f), FLOAT16(-5.2f), FLOAT16( 0.9f), + FLOAT16( 0.3f), FLOAT16( 1.0f), FLOAT16( 2.3f), FLOAT16(-4.1f), FLOAT16( 2.0f), FLOAT16(-5.7f) + }); + set_values(indices, { + 0, 2, 3, 4 + }); + set_values(offsets, { + 0, 2, 2 + }); + set_values(per_sample_weights, { + FLOAT16(0.5f), FLOAT16(0.5f), + FLOAT16(0.5f), FLOAT16(0.5f) + }); + + auto type = embedding_bag::offsets_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(input_layout("Input2", offsets.get_layout())); + topology.add(data("Input3", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2", "Input3"}, type, output_shape, 0) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + network.set_input_data("Input2", offsets); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + /* + * [ 3 + * [ 2 + * [ 3 + * [ 0.65, -0.55 ], [ 2.35, 1.45 ], [ -0.1, 0.25 ] + * ], + * [ 3 + * [ 1.6, -1.55 ], [ 2.1, 0.75 ], [ 2.5, -1.6 ] + * ], + * ], + * [ 2 + * [ 3 + * [ -0.2, 1.3 ], [ 0.5, -0.3 ], [ 0.4, -0.4 ] + * ], + * [ 3 + * [ -0.1, 1.0 ], [ 2.1, 0.7 ], [ -0.2, -0.7 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 3.45, 3.2 ], [ 0.55, -0.5 ], [ -4.25, 0.5 ] + * ], + * [ 3 + * [ 1.9, -2.35 ], [ 1.1, -1.9 ], [ 1.2, -1.2 ] + * ], + * ] + * ] + */ + std::vector expected_results = { + 0.65f, -0.55f, 2.35f, 1.45f, -0.1f, 0.25f, + 1.6f, -1.55f, 2.1f, 0.75f, 2.5f, -1.6f, + -0.2f, 1.3f, 0.5f, -0.3f, 0.4f, -0.4f, + -0.1f, 1.0f, 2.1f, 0.7f, 
-0.2f, -0.7f, + 3.45f, 3.2f, 0.55f, -0.5f, -4.25f, 0.5f, + 1.9f, -2.35f, 1.1f, -1.9f, 1.2f, -1.2f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], float16_to_float32(output_ptr[i]), static_cast(1e-2))) << i; + } +} + +TEST(embedding_bag_fp16_gpu, segments_sum_basic) { + // emb_table : 5x2 + // indices : 4x1 + // segment_ids : 4x1 + // per_sample_weights : 4x1 + // default_index : 1x1 + // Output : 3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 1, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto segment_ids = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto per_sample_weights = memory::allocate(engine, { data_types::f16, format::bfyx, { 4, 1, 1, 1 } }); + tensor output_shape = {3, 2, 1, 1}; + + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16(-0.6f), + FLOAT16(-0.1f), FLOAT16(-0.4f), + FLOAT16(-1.9f), FLOAT16(-1.8f), + FLOAT16(-1.0f), FLOAT16(1.5f), + FLOAT16(0.8f), FLOAT16(-0.7f) + }); + set_values(indices, { + 0, 2, 3, 4 + }); + set_values(segment_ids, { + 0, 0, 2, 2 + }); + set_values(per_sample_weights, { + FLOAT16(0.5f), FLOAT16(0.5f), FLOAT16(0.5f), FLOAT16(0.5f) + }); + + auto type = embedding_bag::segments_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(input_layout("Input2", segment_ids.get_layout())); + topology.add(data("Input3", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2", "Input3"}, type, output_shape, 0) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + network.set_input_data("Input2", segment_ids); + + auto outputs = 
network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + -1.05f, -1.2f, + -0.2f, -0.6f, + -0.1f, 0.4f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], float16_to_float32(output_ptr[i]))) << i; + } +} + +TEST(embedding_bag_fp16_gpu, segments_sum_basic_first_empty) { + // emb_table : 5x2 + // indices : 4x1 + // segment_ids : 4x1 + // per_sample_weights : 4x1 + // default_index : 1x1 + // Output : 3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 1, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto segment_ids = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto per_sample_weights = memory::allocate(engine, { data_types::f16, format::bfyx, { 4, 1, 1, 1 } }); + tensor output_shape = {3, 2, 1, 1}; + + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16(-0.6f), + FLOAT16(-0.1f), FLOAT16(-0.4f), + FLOAT16(-1.9f), FLOAT16(-1.8f), + FLOAT16(-1.0f), FLOAT16(1.5f), + FLOAT16(0.8f), FLOAT16(-0.7f) + }); + set_values(indices, { + 0, 2, 3, 4 + }); + set_values(segment_ids, { + 1, 1, 2, 2 + }); + set_values(per_sample_weights, { + FLOAT16(0.5f), FLOAT16(0.5f), FLOAT16(0.5f), FLOAT16(0.5f) + }); + + auto type = embedding_bag::segments_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(input_layout("Input2", segment_ids.get_layout())); + topology.add(data("Input3", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2", "Input3"}, type, output_shape, 2) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + 
network.set_input_data("Input1", indices); + network.set_input_data("Input2", segment_ids); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + -1.9f, -1.8f, + -1.05f, -1.2f, + -0.1f, 0.4f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], float16_to_float32(output_ptr[i]))) << i; + } +} + +TEST(embedding_bag_fp16_gpu, segments_sum_basic_last_empty) { + // emb_table : 5x2 + // indices : 4x1 + // segment_ids : 4x1 + // per_sample_weights : 4x1 + // default_index : 1x1 + // Output : 3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 1, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto segment_ids = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto per_sample_weights = memory::allocate(engine, { data_types::f16, format::bfyx, { 4, 1, 1, 1 } }); + tensor output_shape = {3, 2, 1, 1}; + + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16(-0.6f), + FLOAT16(-0.1f), FLOAT16(-0.4f), + FLOAT16(-1.9f), FLOAT16(-1.8f), + FLOAT16(-1.0f), FLOAT16(1.5f), + FLOAT16(0.8f), FLOAT16(-0.7f) + }); + set_values(indices, { + 0, 2, 3, 4 + }); + set_values(segment_ids, { + 0, 0, 1, 1 + }); + set_values(per_sample_weights, { + FLOAT16(0.5f), FLOAT16(0.5f), FLOAT16(0.5f), FLOAT16(0.5f) + }); + + auto type = embedding_bag::segments_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(input_layout("Input2", segment_ids.get_layout())); + topology.add(data("Input3", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2", "Input3"}, type, output_shape, 2) + ); + + 
network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + network.set_input_data("Input2", segment_ids); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + -1.05f, -1.2f, + -0.1f, 0.4f, + -1.9f, -1.8f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], float16_to_float32(output_ptr[i]))) << i; + } +} + +TEST(embedding_bag_fp16_gpu, segments_sum_without_weights_and_def_index) { + // emb_table : 5x2 + // indices : 4x1 + // segment_ids : 4x1 + // Output : 3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 1, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto segment_ids = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + tensor output_shape = {3, 2, 1, 1}; + + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16(-0.6f), + FLOAT16(-0.1f), FLOAT16(-0.4f), + FLOAT16(-1.9f), FLOAT16(-1.8f), + FLOAT16(-1.0f), FLOAT16(1.5f), + FLOAT16(0.8f), FLOAT16(-0.7f) + }); + set_values(indices, { + 0, 2, 3, 4 + }); + set_values(segment_ids, { + 0, 0, 2, 2 + }); + + auto type = embedding_bag::segments_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(input_layout("Input2", segment_ids.get_layout())); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2"}, type, output_shape) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + network.set_input_data("Input2", segment_ids); + + auto outputs = network.execute(); + + auto 
output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + -2.1f, -2.4f, + 0, 0, + -0.2f, 0.8f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], float16_to_float32(output_ptr[i]))) << i; + } +} + +TEST(embedding_bag_fp16_gpu, segments_sum_dim3) { + // emb_table : 5x2x3x2 + // indices : 4x1 + // segment_ids : 4x1 + // per_sample_weights : 4x1 + // default_index : 1x1 + // Output : 3x2x3x2 + // Input values in fp16 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f16, format::bfyx, { 5, 2, 3, 2 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto segment_ids = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto per_sample_weights = memory::allocate(engine, { data_types::f16, format::bfyx, { 4, 1, 1, 1 } }); + tensor output_shape = {3, 2, 3, 2}; + + /* + * [ 5 + * [ 2 + * [ 3 + * [ -0.2, 1.3 ], [ 0.5, -0.3 ], [ 0.4, -0.4 ] + * ], + * [ 3 + * [ -0.1, 1.0 ], [ 2.1, 0.7 ], [ -0.2, -0.7 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 1.9, -2.4 ], [ 3.4, -0.7 ], [ -0.4, 0.5 ] + * ], + * [ 3 + * [ 2.3, 1.3 ], [ -0.4, -0.7 ], [ 1.8, -0.9 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 1.5, -2.4 ], [ 4.2, 3.2 ], [ -0.6, 0.9 ] + * ], + * [ 3 + * [ 3.3, -4.1 ], [ 2.1, 0.8 ], [ 5.2, -2.5 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 0.8, -1.9 ], [ 0.7, 3.4 ], [ -3.3, 0.1 ] + * ], + * [ 3 + * [ 3.5, -5.7 ], [ -0.1, 0.3 ], [ 0.4, 3.3 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 6.1, 8.3 ], [ 0.4, -4.4 ], [ -5.2, 0.9 ] + * ], + * [ 3 + * [ 0.3, 1.0 ], [ 2.3, -4.1 ], [ 2.0, -5.7 ] + * ], + * ] + * ] + */ + set_values(emb_table, { + FLOAT16(-0.2f), FLOAT16( 1.3f), FLOAT16( 0.5f), FLOAT16(-0.3f), FLOAT16( 0.4f), FLOAT16(-0.4f), + FLOAT16(-0.1f), FLOAT16( 1.0f), FLOAT16( 2.1f), FLOAT16( 0.7f), FLOAT16(-0.2f), FLOAT16(-0.7f), + FLOAT16( 1.9f), FLOAT16(-2.4f), 
FLOAT16( 3.4f), FLOAT16(-0.7f), FLOAT16(-0.4f), FLOAT16( 0.5f), + FLOAT16( 2.3f), FLOAT16( 1.3f), FLOAT16(-0.4f), FLOAT16(-0.7f), FLOAT16( 1.8f), FLOAT16(-0.9f), + FLOAT16( 1.5f), FLOAT16(-2.4f), FLOAT16( 4.2f), FLOAT16( 3.2f), FLOAT16(-0.6f), FLOAT16( 0.9f), + FLOAT16( 3.3f), FLOAT16(-4.1f), FLOAT16( 2.1f), FLOAT16( 0.8f), FLOAT16( 5.2f), FLOAT16(-2.5f), + FLOAT16( 0.8f), FLOAT16(-1.9f), FLOAT16( 0.7f), FLOAT16( 3.4f), FLOAT16(-3.3f), FLOAT16( 0.1f), + FLOAT16( 3.5f), FLOAT16(-5.7f), FLOAT16(-0.1f), FLOAT16( 0.3f), FLOAT16( 0.4f), FLOAT16( 3.3f), + FLOAT16( 6.1f), FLOAT16( 8.3f), FLOAT16( 0.4f), FLOAT16(-4.4f), FLOAT16(-5.2f), FLOAT16( 0.9f), + FLOAT16( 0.3f), FLOAT16( 1.0f), FLOAT16( 2.3f), FLOAT16(-4.1f), FLOAT16( 2.0f), FLOAT16(-5.7f) + }); + set_values(indices, { + 0, 2, 3, 4 + }); + set_values(segment_ids, { + 0, 0, 2, 2 + }); + set_values(per_sample_weights, { + FLOAT16(0.5f), FLOAT16(0.5f), + FLOAT16(0.5f), FLOAT16(0.5f) + }); + + auto type = embedding_bag::segments_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(input_layout("Input2", segment_ids.get_layout())); + topology.add(data("Input3", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2", "Input3"}, type, output_shape, 0) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + network.set_input_data("Input2", segment_ids); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + /* + * [ 3 + * [ 2 + * [ 3 + * [ 0.65, -0.55 ], [ 2.35, 1.45 ], [ -0.1, 0.25 ] + * ], + * [ 3 + * [ 1.6, -1.55 ], [ 2.1, 0.75 ], [ 2.5, -1.6 ] + * ], + * ], + * [ 2 + * [ 3 + * [ -0.2, 1.3 ], [ 0.5, -0.3 ], [ 0.4, -0.4 ] + * ], + * [ 3 + * [ -0.1, 1.0 ], [ 2.1, 0.7 ], [ -0.2, -0.7 ] + * ], + * ], + 
* [ 2 + * [ 3 + * [ 3.45, 3.2 ], [ 0.55, -0.5 ], [ -4.25, 0.5 ] + * ], + * [ 3 + * [ 1.9, -2.35 ], [ 1.1, -1.9 ], [ 1.2, -1.2 ] + * ], + * ] + * ] + */ + std::vector expected_results = { + 0.65f, -0.55f, 2.35f, 1.45f, -0.1f, 0.25f, + 1.6f, -1.55f, 2.1f, 0.75f, 2.5f, -1.6f, + -0.2f, 1.3f, 0.5f, -0.3f, 0.4f, -0.4f, + -0.1f, 1.0f, 2.1f, 0.7f, -0.2f, -0.7f, + 3.45f, 3.2f, 0.55f, -0.5f, -4.25f, 0.5f, + 1.9f, -2.35f, 1.1f, -1.9f, 1.2f, -1.2f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], float16_to_float32(output_ptr[i]), static_cast(1e-2))) << i; + } +} + +TEST(embedding_bag_fp32_gpu, packed_sum_basic) { + // emb_table : 5x2 + // indices : 3x2 + // per_sample_weights : 3x2 + // Output : 3x2 + // Input values in fp32 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f32, format::bfyx, { 5, 2, 1, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 2, 1, 1 } }); + auto per_sample_weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 3, 2, 1, 1 } }); + tensor output_shape = {3, 2, 1, 1}; + + set_values(emb_table, { + -0.2f, -0.6f, + -0.1f, -0.4f, + -1.9f, -1.8f, + -1.0f, 1.5f, + 0.8f, -0.7f + }); + set_values(indices, { + 0, 2, + 1, 2, + 3, 4 + }); + set_values(per_sample_weights, { + 0.5f, 0.5f, + 0.5f, 0.5f, + 0.5f, 0.5f + }); + + auto type = embedding_bag::packed_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(data("Input2", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2"}, type, output_shape) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto 
output_ptr = output.pointer(); + + std::vector expected_results = { + -1.05f, -1.2f, + -1.f, -1.1f, + -0.1f, 0.4f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], output_ptr[i])) << i; + } +} + +TEST(embedding_bag_fp32_gpu, packed_sum_dim3) { + // emb_table : 5x2x3x2 + // indices : 3x2 + // per_sample_weights : 3x2 + // Output : 3x2x3x2 + // Input values in fp32 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f32, format::bfyx, { 5, 2, 3, 2 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 2, 1, 1 } }); + auto per_sample_weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 3, 2, 1, 1 } }); + tensor output_shape = {3, 2, 3, 2}; + + /* + * [ 5 + * [ 2 + * [ 3 + * [ -0.2, 1.3 ], [ 0.5, -0.3 ], [ 0.4, -0.4 ] + * ], + * [ 3 + * [ -0.1, 1.0 ], [ 2.1, 0.7 ], [ -0.2, -0.7 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 1.9, -2.4 ], [ 3.4, -0.7 ], [ -0.4, 0.5 ] + * ], + * [ 3 + * [ 2.3, 1.3 ], [ -0.4, -0.7 ], [ 1.8, -0.9 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 1.5, -2.4 ], [ 4.2, 3.2 ], [ -0.6, 0.9 ] + * ], + * [ 3 + * [ 3.3, -4.1 ], [ 2.1, 0.8 ], [ 5.2, -2.5 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 0.8, -1.9 ], [ 0.7, 3.4 ], [ -3.3, 0.1 ] + * ], + * [ 3 + * [ 3.5, -5.7 ], [ -0.1, 0.3 ], [ 0.4, 3.3 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 6.1, 8.3 ], [ 0.4, -4.4 ], [ -5.2, 0.9 ] + * ], + * [ 3 + * [ 0.3, 1.0 ], [ 2.3, -4.1 ], [ 2.0, -5.7 ] + * ], + * ] + * ] + */ + set_values(emb_table, { + -0.2f, 1.3f, 0.5f, -0.3f, 0.4f, -0.4f, + -0.1f, 1.0f, 2.1f, 0.7f, -0.2f, -0.7f, + 1.9f, -2.4f, 3.4f, -0.7f, -0.4f, 0.5f, + 2.3f, 1.3f, -0.4f, -0.7f, 1.8f, -0.9f, + 1.5f, -2.4f, 4.2f, 3.2f, -0.6f, 0.9f, + 3.3f, -4.1f, 2.1f, 0.8f, 5.2f, -2.5f, + 0.8f, -1.9f, 0.7f, 3.4f, -3.3f, 0.1f, + 3.5f, -5.7f, -0.1f, 0.3f, 0.4f, 3.3f, + 6.1f, 8.3f, 0.4f, -4.4f, -5.2f, 0.9f, + 0.3f, 1.0f, 2.3f, -4.1f, 2.0f, -5.7f + }); + set_values(indices, { + 0, 2, + 1, 
2, + 3, 4 + }); + set_values(per_sample_weights, { + 0.5f, 0.5f, + 0.5f, 0.5f, + 0.5f, 0.5f + }); + + auto type = embedding_bag::packed_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(data("Input2", per_sample_weights)); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2"}, type, output_shape) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + /* + * [ 3 + * [ 2 + * [ 3 + * [ 0.65, -0.55 ], [ 2.35, 1.45 ], [ -0.1, 0.25 ] + * ], + * [ 3 + * [ 1.6, -1.55 ], [ 2.1, 0.75 ], [ 2.5, -1.6 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 1.7, -2.4 ], [ 3.8, 1.25 ], [ -0.5, 0.7 ] + * ], + * [ 3 + * [ 2.8, -1.4 ], [ 0.85, 0.05 ], [ 3.5, -1.7 ] + * ], + * ], + * [ 2 + * [ 3 + * [ 3.45, 3.2 ], [ 0.55, -0.5 ], [ -4.25, 0.5 ] + * ], + * [ 3 + * [ 1.9, -2.35 ], [ 1.1, -1.9 ], [ 1.2, -1.2 ] + * ], + * ] + * ] + */ + std::vector expected_results = { + 0.65f, -0.55f, 2.35f, 1.45f, -0.1f, 0.25f, + 1.6f, -1.55f, 2.1f, 0.75f, 2.5f, -1.6f, + 1.7f, -2.4f, 3.8f, 1.25f, -0.5f, 0.7f, + 2.8f, -1.4f, 0.85f, 0.05f, 3.5f, -1.7f, + 3.45f, 3.2f, 0.55f, -0.5f, -4.25f, 0.5f, + 1.9f, -2.35f, 1.1f, -1.9f, 1.2f, -1.2f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], output_ptr[i])) << i; + } +} + +TEST(embedding_bag_fp32_gpu, extended5_6) { + // emb_table : 5x6 + // indices : 5x1 + // segment_ids : 5x1 + // Output : 5x6 + // Input values in fp32 + const auto& engine = get_test_engine(); + + auto emb_table = memory::allocate(engine, { data_types::f32, format::bfyx, { 5, 6, 1, 1 } }); + auto indices = memory::allocate(engine, { data_types::i32, format::bfyx, { 5, 1, 1, 1 } }); + 
auto segment_ids = memory::allocate(engine, { data_types::i32, format::bfyx, { 5, 1, 1, 1 } }); + tensor output_shape = {5, 6, 1, 1}; + + set_values(emb_table, { + 0.f, 1.f, 8.f, 5.f, 5.f, 2.f, + 0.f, 7.f, 7.f, 10.f, 4.f, 5.f, + 9.f, 0.f, 0.f, 5.f, 7.f, 0.f, + 4.f, 0.f, 4.f, 7.f, 6.f, 10.f, + 9.f, 5.f, 1.f, 7.f, 4.f, 7.f + }); + set_values(indices, { 0, 1, 2, 2, 3 }); + set_values(segment_ids, { 0, 0, 2, 2, 4 }); + + auto type = embedding_bag::segments_sum; + topology topology; + topology.add(input_layout("Input0", emb_table.get_layout())); + topology.add(input_layout("Input1", indices.get_layout())); + topology.add(input_layout("Input2", segment_ids.get_layout())); + topology.add( + embedding_bag("embedding_bag", {"Input0", "Input1", "Input2"}, type, output_shape) + ); + + network network(engine, topology); + + network.set_input_data("Input0", emb_table); + network.set_input_data("Input1", indices); + network.set_input_data("Input2", segment_ids); + + auto outputs = network.execute(); + + auto output = outputs.at("embedding_bag").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 0, 8, 15, 15, 9, 7, + 0, 0, 0, 0, 0, 0, + 18, 0, 0, 10, 14, 0, + 0, 0, 0, 0, 0, 0, + 4, 0, 4, 7, 6, 10, + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_TRUE(are_equal(expected_results[i], output_ptr[i])) << i; + } +}