[IE CLDNN] Added space_to_batch operation (#984)

2020-06-24 18:30:24 +03:00
parent cec12131e7
commit bc132056f9
20 changed files with 1275 additions and 5 deletions
--- a/inference-engine/src/cldnn_engine/cldnn_engine.cpp
+++ b/inference-engine/src/cldnn_engine/cldnn_engine.cpp
@@ -82,7 +82,8 @@ InferenceEngine::ICNNNetwork::Ptr clDNNEngine::CloneNetwork(const InferenceEngin

            return std::dynamic_pointer_cast<const ::ngraph::opset2::Gelu>(node) ||
                   std::dynamic_pointer_cast<const ::ngraph::opset3::ShuffleChannels>(node) ||
-                   std::dynamic_pointer_cast<const ::ngraph::opset2::BatchToSpace>(node);
+                   std::dynamic_pointer_cast<const ::ngraph::opset2::BatchToSpace>(node) ||
+                   std::dynamic_pointer_cast<const ::ngraph::opset2::SpaceToBatch>(node);
        };
        auto nGraphFunc = clonedNetwork->getFunction();
        // Disable shape inference (WA for generic operations)
--- a/inference-engine/src/cldnn_engine/cldnn_program.cpp
+++ b/inference-engine/src/cldnn_engine/cldnn_program.cpp
@@ -49,6 +49,7 @@
 #include <api/depth_to_space.hpp>
 #include <api/space_to_depth.hpp>
 #include <api/batch_to_space.hpp>
+#include <api/space_to_batch.hpp>
 #include <api/shuffle_channels.hpp>
 #include <api/strided_slice.hpp>
 #include <api/reverse_sequence.hpp>
@@ -540,6 +541,7 @@ Program::LayerType Program::LayerTypeFromStr(const std::string &str) {
        { "DepthToSpace" , DepthToSpace },
        { "SpaceToDepth" , SpaceToDepth },
        { "BatchToSpace", BatchToSpace },
+        { "SpaceToBatch" , SpaceToBatch },
        { "ShuffleChannels" , ShuffleChannels },
        { "StridedSlice" , StridedSlice },
        { "ReverseSequence" , ReverseSequence },
@@ -1243,6 +1245,8 @@ void Program::CreateSingleLayerPrimitive(cldnn::topology& topology, InferenceEng
            break;
        case BatchToSpace: CreateBatchToSpacePrimitive(topology, layer);
            break;
+        case SpaceToBatch: CreateSpaceToBatchPrimitive(topology, layer);
+            break;
        case ShuffleChannels: CreateShuffleChannelsPrimitive(topology, layer);
            break;
        case StridedSlice: CreateStridedSlicePrimitive(topology, layer);
@@ -3914,6 +3918,62 @@ void Program::CreateBatchToSpacePrimitive(cldnn::topology& topology, InferenceEn
    AddPrimitiveToProfiler(batchToSpaceName, layer);
 }

+void Program::CreateSpaceToBatchPrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr &layer) {
+    ValidateLayer(layer, 4);
+
+    auto inputPrimitives = GetPrevLayersPrimitives(layer);
+    auto spaceToBatch = as<InferenceEngine::GenericLayer*> (layer);
+    auto rank = spaceToBatch->input().get()->getTensorDesc().getDims().size();
+    auto format = FormatFromLayout(spaceToBatch->input()->getLayout());
+
+    std::vector<cldnn::tensor> inputs;
+    inputs.reserve(3);
+
+    for (size_t i = 1; i < 4; ++i) {
+        auto defaultIndexInput = layer->insData[i].lock();
+        auto defaultIndexInputCreator = defaultIndexInput->getCreatorLayer().lock();
+        if (defaultIndexInputCreator->blobs.size() == 1) {
+            auto constantBlob = defaultIndexInputCreator->blobs.begin()->second;
+            auto defaultIndexPrecision = constantBlob->getTensorDesc().getPrecision();
+            std::vector<int32_t> sizes;
+            sizes.reserve(rank);
+            int32_t default_size = i == 1 ? 1 : 0;
+            switch (defaultIndexPrecision) {
+                case InferenceEngine::Precision::I32: {
+                    auto data = constantBlob->buffer().as<int32_t*>();
+                    sizes = std::vector<int32_t>(data, data + rank);
+                    break;
+                }
+                case InferenceEngine::Precision::I64: {
+                    auto data = constantBlob->buffer().as<int64_t*>();
+                    std::vector<int64_t> sizes_i64 = std::vector<int64_t>(data, data + rank);
+                    for (size_t j = 0; j < sizes_i64.size(); ++j)
+                        sizes.emplace_back(static_cast<int32_t>(sizes_i64[j]));
+                    break;
+                }
+                default: {
+                    THROW_IE_EXCEPTION << layer->name << "Incorrect SpaceToBatch precision";
+                    break;
+                }
+            }
+            inputs.emplace_back(format, sizes, default_size);
+        }
+    }
+    auto out_size = CldnnTensorFromIEDims(spaceToBatch->outData[0]->getTensorDesc().getDims());
+
+    std::string spaceToBatchName = layer_type_name_ID(layer);
+    auto spaceToBatchPrim = cldnn::space_to_batch(
+            spaceToBatchName,
+            inputPrimitives[0], //input
+            inputs[0], //block_shape
+            inputs[1], //pads_begin
+            inputs[2], //pads_end
+            out_size);
+
+    topology.add(spaceToBatchPrim);
+    AddPrimitiveToProfiler(spaceToBatchName, layer);
+}
+
 void Program::CreateShuffleChannelsPrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr &layer) {
    ValidateLayer(layer, 1);

--- a/inference-engine/src/cldnn_engine/cldnn_program.h
+++ b/inference-engine/src/cldnn_engine/cldnn_program.h
@@ -177,6 +177,7 @@ public:
        DepthToSpace,
        SpaceToDepth,
        BatchToSpace,
+        SpaceToBatch,
        ShuffleChannels,
        StridedSlice,
        Broadcast,
@@ -358,6 +359,7 @@ private:
    void CreateDepthToSpacePrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr &layer);
    void CreateSpaceToDepthPrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr &layer);
    void CreateBatchToSpacePrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr &layer);
+    void CreateSpaceToBatchPrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr &layer);
    void CreateShuffleChannelsPrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr &layer);
    void CreateStridedSlicePrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr &layer);
    void CreateBroadcastPrimitive(cldnn::topology &topology, InferenceEngine::CNNLayerPtr &layer);
--- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/space_to_batch.cpp
+++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/space_to_batch.cpp
@@ -0,0 +1,29 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+
+#include "single_layer_tests/space_to_batch.hpp"
+#include "common_test_utils/test_constants.hpp"
+
+using namespace LayerTestsDefinitions;
+
+namespace {
+
+spaceToBatchParamsTuple stb_only_test_cases[] = {
+        spaceToBatchParamsTuple({1, 1, 2, 2}, {0, 0, 0, 0}, {0, 0, 0, 0}, {1, 1, 2, 2},
+                                InferenceEngine::Precision::FP32, CommonTestUtils::DEVICE_GPU),
+        spaceToBatchParamsTuple({1, 1, 2, 2}, {0, 0, 0, 0}, {0, 0, 0, 0}, {1, 3, 2, 2},
+                                InferenceEngine::Precision::FP32, CommonTestUtils::DEVICE_GPU),
+        spaceToBatchParamsTuple({1, 1, 2, 2}, {0, 0, 0, 0}, {0, 0, 0, 0}, {1, 1, 4, 4},
+                                InferenceEngine::Precision::FP32, CommonTestUtils::DEVICE_GPU),
+        spaceToBatchParamsTuple({1, 1, 2, 2}, {0, 0, 0, 2}, {0, 0, 0, 0}, {2, 1, 2, 4},
+                                InferenceEngine::Precision::FP32, CommonTestUtils::DEVICE_GPU),
+        spaceToBatchParamsTuple({1, 1, 3, 2, 2}, {0, 0, 1, 0, 3}, {0, 0, 2, 0, 0}, {1, 1, 3, 2, 1},
+                                InferenceEngine::Precision::FP32, CommonTestUtils::DEVICE_GPU),
+};
+
+INSTANTIATE_TEST_CASE_P(smoke_CLDNN, SpaceToBatchLayerTest, ::testing::ValuesIn(stb_only_test_cases),
+                        SpaceToBatchLayerTest::getTestCaseName);
+}  // namespace
--- a/inference-engine/thirdparty/clDNN/api/space_to_batch.hpp
+++ b/inference-engine/thirdparty/clDNN/api/space_to_batch.hpp
@@ -0,0 +1,86 @@
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#pragma once
+#include "primitive.hpp"
+
+namespace cldnn {
+/// @addtogroup cpp_api C++ API
+/// @{
+/// @addtogroup cpp_topology Network Topology
+/// @{
+/// @addtogroup cpp_primitives Primitives
+/// @{
+
+/// @brief SpaceToBatch operation divides "spatial" dimensions [1, ..., N - 1], N ∈ {4,5,6} of the data input
+/// into a grid of blocks of shape block_shape, and interleaves these blocks with the batch dimension (0) such that in the output,
+/// the spatial dimensions [1, ..., N - 1], N ∈ {4,5,6} correspond to the position within the grid,
+/// and the batch dimension combines both the position within a spatial block and the original batch position.
+/// Prior to division into blocks, the spatial dimensions of the input are optionally zero padded according to pads_begin and pads_end.
+/// @details The SpaceToBatch operation is similar to the TensorFlow* operation SpaceToBatchND (https://www.tensorflow.org/api_docs/python/tf/space_to_batch_nd)
+/// There are 4 inputs of this operation:
+/// 1) data - input N-D tensor [batch, D_1, D_2 ... D_{N-1}], N ∈ {4,5,6}. Required.
+/// 2) block_shape - input 1-D tensor with shape [N], N ∈ {4,5,6}. Consists of block_sizes each of which specifies the size of the value block to be moved.
+/// All values must be >= 1 and required. block_shape[0] is expected to be 1.
+/// 3) pads_begin - input 1-D tensor with shape [N], N ∈ {4,5,6}. Specifies the padding for the beginning along each axis of data input.
+/// All values must be non-negative and required. pads_begin[0] is expected to be 0.
+/// 4) pads_end - input 1-D tensor with shape [N], N ∈ {4,5,6}. Specifies the padding for the ending along each axis of data input.
+/// All values must be non-negative and required. pads_end[0] is expected to be 0.
+/// 3-4 inputs required that block_shape[i] divides data_shape[i] + pads_begin[i] + pads_end[i]
+///
+/// The operation is equivalent to the following transformation of the input tensor data of shape [batch, D_1, D_2 ... D_{N - 1}], N ∈ {4,5,6}
+/// and block_shape, pads_begin, pads_end of shapes [N] to Y output tensor.
+/// Zero-pad the start and end of dimensions [D_0, ..., D_{N - 1}] of the input according to `pads_begin` and `pads_end`
+///
+/// x' = reshape(x, [batch, (D_1 + P_1) / B_1, B_1, (D_2 + P_2) / B_2, B_2, ..., (D_{N - 1} + P_{N - 1}) / B_{N - 1}, B_{N - 1}]), where B_i = block_shape[i]
+///
+/// x'' = transpose(x',  [2, 4, ..., (N - 1) + (N - 1), 0, 1, 3, ..., N + (N - 1)])
+///
+/// y = reshape(x'', [batch * B_1 * ... * B_{N - 1}, (D_1 + P_1) / B_1, (D_2 + P_2) / B_2, ... , (D_{N - 1} + P_{N - 1}) / B_{N - 1}])
+
+struct space_to_batch : public primitive_base<space_to_batch> {
+    CLDNN_DECLARE_PRIMITIVE(space_to_batch)
+
+    /// @brief Constructs space_to_batch primitive.
+    /// @param id This primitive id.
+    /// @param input Input data primitive id.
+    /// @param block_shape Array of block sizes.
+    /// @param pads_begin Amount to pad for the beginning along each axis of data input.
+    /// @param pads_end Amount to pad for the ending along each axis of data input.
+    /// @param out_size Size of output tensor.
+    space_to_batch(const primitive_id& id,
+                   const primitive_id& input,
+                   const tensor& block_shape,
+                   const tensor& pads_begin,
+                   const tensor& pads_end,
+                   const tensor& out_size,
+                   const padding& output_padding = padding())
+        : primitive_base(id, {input}, output_padding),
+          block_shape(block_shape),
+          pads_begin(pads_begin),
+          pads_end(pads_end),
+          out_size(out_size) {}
+
+    tensor block_shape;
+    tensor pads_begin;
+    tensor pads_end;
+    tensor out_size;
+};
+/// @}
+/// @}
+/// @}
+}  // namespace cldnn
--- a/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h
@@ -81,6 +81,7 @@ enum class KernelType {
    REDUCE,
    GATHER_TREE,
    SPACE_TO_DEPTH,
+    SPACE_TO_BATCH,
    GRN,
    CTC_GREEDY_DECODER,
    CUM_SUM,
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_batch/space_to_batch_kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_batch/space_to_batch_kernel_base.cpp
@@ -0,0 +1,102 @@
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "space_to_batch_kernel_base.h"
+#include "kernel_selector_utils.h"
+#include <string>
+
+namespace kernel_selector {
+
+bool SpaceToBatchKernelBase::Validate(const Params& p, const optional_params& o) const {
+    if (p.GetType() != KernelType::SPACE_TO_BATCH ||
+        o.GetType() != KernelType::SPACE_TO_BATCH) {
+        return false;
+    }
+
+    return true;
+}
+
+CommonDispatchData SpaceToBatchKernelBase::SetDefault(const space_to_batch_params& params, const optional_params&) const {
+    CommonDispatchData runInfo;
+
+    std::vector<size_t> global = { params.output.Batch().v,
+                                   params.output.Feature().v,
+                                   params.output.W().v * params.output.Z().v * params.output.Y().v * params.output.X().v };
+
+    auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+
+    runInfo.gws0 = global[0];
+    runInfo.gws1 = global[1];
+    runInfo.gws2 = global[2];
+
+    runInfo.lws0 = local[0];
+    runInfo.lws1 = local[1];
+    runInfo.lws2 = local[2];
+
+    return runInfo;
+}
+
+JitConstants SpaceToBatchKernelBase::GetJitConstants(const space_to_batch_params& params) const {
+    JitConstants jit = MakeBaseParamsJitConstants(params);
+
+    auto makeJitConstForParam = [](JitConstants& jit, const std::string name, const DimTensor<uint32_t>& args, const size_t default_value) {
+        jit.AddConstant(MakeJitConstant(name + "_SIZES", args));
+        jit.AddConstant(MakeJitConstant(name + "_BATCH", args.b));
+        jit.AddConstant(MakeJitConstant(name + "_FEATURE", args.f));
+        jit.AddConstant(MakeJitConstant(name + "_Y", args.y));
+        jit.AddConstant(MakeJitConstant(name + "_X", args.x));
+
+        if (args.w != 0) {
+            jit.AddConstant(MakeJitConstant(name + "_W", args.w));
+            jit.AddConstant(MakeJitConstant(name + "_Z", args.z));
+        } else if(args.z != 0) {
+            jit.AddConstant(MakeJitConstant(name + "_W", default_value));
+            jit.AddConstant(MakeJitConstant(name + "_Z", args.z));
+        } else {
+            jit.AddConstant(MakeJitConstant(name + "_W", default_value));
+            jit.AddConstant(MakeJitConstant(name + "_Z", default_value));
+        }
+    };
+
+    makeJitConstForParam(jit, "BLOCK_SHAPE", params.block_shape, 1);
+    makeJitConstForParam(jit, "PADS_BEGIN", params.pads_begin, 0);
+    makeJitConstForParam(jit, "PADS_END", params.pads_end, 0);
+
+    return jit;
+}
+
+KernelsData SpaceToBatchKernelBase::GetCommonKernelsData(const Params& params, const optional_params& options, float estimatedTime) const {
+    KernelData kd = KernelData::Default<space_to_batch_params>(params);
+    space_to_batch_params& newParams = *static_cast<space_to_batch_params*>(kd.params.get());
+
+    if (!Validate(params, options)) {
+        return {};
+    }
+
+    auto runInfo = SetDefault(newParams, options);
+    auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
+    auto cldnn_jit = GetJitConstants(newParams);
+    std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
+
+    auto& kernel = kd.kernels[0];
+
+    FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point);
+
+    kd.estimatedTime = estimatedTime;
+
+    return { kd };
+}
+}  // namespace kernel_selector
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_batch/space_to_batch_kernel_base.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_batch/space_to_batch_kernel_base.h
@@ -0,0 +1,63 @@
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+#include "common_kernel_base.h"
+#include "kernel_selector_params.h"
+#include <vector>
+
+namespace kernel_selector {
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// space_to_batch_params
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+struct space_to_batch_params : public base_params {
+    space_to_batch_params() : base_params(KernelType::SPACE_TO_BATCH) {}
+    DimTensor<uint32_t> block_shape;
+    DimTensor<uint32_t> pads_begin;
+    DimTensor<uint32_t> pads_end;
+
+    virtual ParamsKey GetParamsKey() const { return base_params::GetParamsKey(); }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// space_to_batch_optional_params
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+struct space_to_batch_optional_params : optional_params {
+    space_to_batch_optional_params() : optional_params(KernelType::SPACE_TO_BATCH) {}
+};
+
+struct space_to_batch_fuse_params : fuse_params {
+    space_to_batch_fuse_params() : fuse_params(KernelType::SPACE_TO_BATCH) {}
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// SpaceToBatchKernelBase
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+class SpaceToBatchKernelBase : public common_kernel_base {
+public:
+    using common_kernel_base::common_kernel_base;
+    virtual ~SpaceToBatchKernelBase() {}
+
+    struct DispatchData : public CommonDispatchData {};
+
+protected:
+    virtual bool Validate(const Params&, const optional_params&) const;
+    virtual JitConstants GetJitConstants(const space_to_batch_params& params) const;
+    virtual CommonDispatchData SetDefault(const space_to_batch_params& params, const optional_params&) const;
+    KernelsData GetCommonKernelsData(const Params& params, const optional_params&, float estimatedTime) const;
+};
+}  // namespace kernel_selector
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_batch/space_to_batch_kernel_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_batch/space_to_batch_kernel_ref.cpp
@@ -0,0 +1,46 @@
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "space_to_batch_kernel_ref.h"
+#include "kernel_selector_utils.h"
+#include <string>
+#include <vector>
+
+namespace kernel_selector {
+ParamsKey SpaceToBatchKernelRef::GetSupportedKey() const {
+    ParamsKey k;
+    k.EnableInputDataType(Datatype::F16);
+    k.EnableInputDataType(Datatype::F32);
+    k.EnableOutputDataType(Datatype::F16);
+    k.EnableOutputDataType(Datatype::F32);
+
+    k.EnableInputLayout(DataLayout::bfyx);
+    k.EnableInputLayout(DataLayout::bfzyx);
+    k.EnableInputLayout(DataLayout::bfwzyx);
+    k.EnableOutputLayout(DataLayout::bfyx);
+    k.EnableOutputLayout(DataLayout::bfzyx);
+    k.EnableOutputLayout(DataLayout::bfwzyx);
+
+    k.EnableTensorOffset();
+    k.EnableTensorPitches();
+    k.EnableBatching();
+    return k;
+}
+
+KernelsData SpaceToBatchKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
+    return GetCommonKernelsData(params, options, FORCE_PRIORITY_9);
+}
+}  // namespace kernel_selector
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_batch/space_to_batch_kernel_ref.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_batch/space_to_batch_kernel_ref.h
@@ -0,0 +1,30 @@
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+#include "space_to_batch_kernel_base.h"
+
+namespace kernel_selector {
+class SpaceToBatchKernelRef : public SpaceToBatchKernelBase {
+public:
+    SpaceToBatchKernelRef() : SpaceToBatchKernelBase("space_to_batch_ref") {}
+    virtual ~SpaceToBatchKernelRef() {}
+
+    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+    ParamsKey GetSupportedKey() const override;
+};
+}  // namespace kernel_selector
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_batch/space_to_batch_kernel_selector.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_batch/space_to_batch_kernel_selector.cpp
@@ -0,0 +1,29 @@
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "space_to_batch_kernel_selector.h"
+#include "space_to_batch_kernel_ref.h"
+
+namespace kernel_selector {
+
+space_to_batch_kernel_selector::space_to_batch_kernel_selector() {
+    Attach<SpaceToBatchKernelRef>();
+}
+
+KernelsData space_to_batch_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
+    return GetNaiveBestKernel(params, options, KernelType::SPACE_TO_BATCH);
+}
+}  // namespace kernel_selector
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_batch/space_to_batch_kernel_selector.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_batch/space_to_batch_kernel_selector.h
@@ -0,0 +1,35 @@
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+#include "kernel_selector.h"
+
+namespace kernel_selector {
+class space_to_batch_kernel_selector : public kernel_selector_base {
+public:
+    static space_to_batch_kernel_selector& Instance() {
+        static space_to_batch_kernel_selector instance_;
+        return instance_;
+    }
+
+    space_to_batch_kernel_selector();
+
+    virtual ~space_to_batch_kernel_selector() {}
+
+    KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
+};
+}  // namespace kernel_selector
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/space_to_batch_ref.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/space_to_batch_ref.cl
@@ -0,0 +1,73 @@
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "include/include_all.cl"
+
+KERNEL(space_to_batch_ref)(const __global INPUT0_TYPE* input, __global OUTPUT_TYPE* output)
+{
+    const uint batch = get_global_id(0);
+    const uint feature = get_global_id(1);
+
+#ifdef OUTPUT_LAYOUT_BFYX
+    const uint w = 0;
+    const uint z = 0;
+    const uint y = (uint)get_global_id(2) / OUTPUT_SIZE_X;
+    const uint x = (uint)get_global_id(2) % OUTPUT_SIZE_X;
+#elif OUTPUT_LAYOUT_BFZYX
+    const uint w = 0;
+    const uint yx = (uint)get_global_id(2) % (OUTPUT_SIZE_X * OUTPUT_SIZE_Y);
+    const uint z = (uint)get_global_id(2) / (OUTPUT_SIZE_X * OUTPUT_SIZE_Y);
+    const uint y = yx / OUTPUT_SIZE_X;
+    const uint x = yx % OUTPUT_SIZE_X;
+#elif OUTPUT_LAYOUT_BFWZYX
+    const uint zyx = (uint)get_global_id(2) % (OUTPUT_SIZE_X * OUTPUT_SIZE_Y * OUTPUT_SIZE_Z);
+    const uint w = (uint)get_global_id(2) / (OUTPUT_SIZE_X * OUTPUT_SIZE_Y * OUTPUT_SIZE_Z);
+    const uint yx = zyx % (OUTPUT_SIZE_X * OUTPUT_SIZE_Y);
+    const uint z = zyx / (OUTPUT_SIZE_X * OUTPUT_SIZE_Y);
+    const uint y = yx / OUTPUT_SIZE_X;
+    const uint x = yx % OUTPUT_SIZE_X;
+#endif
+
+    const uint input_batch = batch % INPUT0_BATCH_NUM;
+    const uint offset_batch =  batch / INPUT0_BATCH_NUM;
+
+    const int input_feature = feature * BLOCK_SHAPE_FEATURE - PADS_BEGIN_FEATURE +
+                              offset_batch / (BLOCK_SHAPE_W * BLOCK_SHAPE_Z * BLOCK_SHAPE_Y * BLOCK_SHAPE_X);
+    const uint offset_feature = offset_batch % (BLOCK_SHAPE_W * BLOCK_SHAPE_Z * BLOCK_SHAPE_Y * BLOCK_SHAPE_X);
+
+    const int input_w = w * BLOCK_SHAPE_W - PADS_BEGIN_W + offset_feature / (BLOCK_SHAPE_Z * BLOCK_SHAPE_Y * BLOCK_SHAPE_X);
+    const uint offset_w = offset_feature % (BLOCK_SHAPE_Z * BLOCK_SHAPE_Y * BLOCK_SHAPE_X);
+
+    const int input_z = z * BLOCK_SHAPE_Z - PADS_BEGIN_Z + offset_w / (BLOCK_SHAPE_Y * BLOCK_SHAPE_X);
+    const uint offset_z = offset_w % (BLOCK_SHAPE_Y * BLOCK_SHAPE_X);
+
+    const int input_y = y * BLOCK_SHAPE_Y - PADS_BEGIN_Y + offset_z / BLOCK_SHAPE_X;
+    const uint offset_y = offset_z % BLOCK_SHAPE_X;
+
+    const int input_x = x * BLOCK_SHAPE_X - PADS_BEGIN_X + offset_y;
+
+    const int input_index = GET_DATA_INDEX_6D(INPUT0, input_batch, input_feature, input_w, input_z, input_y, input_x);
+
+    const uint output_index = GET_DATA_INDEX_6D(OUTPUT, batch, feature, w, z, y, x);
+
+    const bool out_of_bounds = input_feature < 0 || input_feature >= INPUT0_FEATURE_NUM ||
+                               input_w < 0 || input_w >= INPUT0_SIZE_W ||
+                               input_z < 0 || input_z >= INPUT0_SIZE_Z ||
+                               input_y < 0 || input_y >= INPUT0_SIZE_Y ||
+                               input_x < 0 || input_x >= INPUT0_SIZE_X;
+
+    INPUT0_TYPE in = out_of_bounds ? INPUT0_VAL_ZERO : input[input_index];
+    output[output_index] = ACTIVATION(in, ACTIVATION_PARAMS);
+}
--- a/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.cpp
+++ b/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.cpp
@@ -85,6 +85,7 @@ void register_implementations_gpu() {
    REGISTER_GPU(shuffle_channels);
    REGISTER_GPU(softmax);
    REGISTER_GPU(softmax_loss_grad);
+    REGISTER_GPU(space_to_batch);
    REGISTER_GPU(space_to_depth);
    REGISTER_GPU(strided_slice);
    REGISTER_GPU(tile);
--- a/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.hpp
+++ b/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.hpp
@@ -77,6 +77,7 @@
 #include "api/shuffle_channels.hpp"
 #include "api/softmax.hpp"
 #include "api/softmax_loss_grad.hpp"
+#include "api/space_to_batch.hpp"
 #include "api/strided_slice.hpp"
 #include "api/tile.hpp"
 #include "api/resample.hpp"
@@ -163,6 +164,7 @@ REGISTER_GPU(select);
 REGISTER_GPU(shuffle_channels);
 REGISTER_GPU(softmax);
 REGISTER_GPU(softmax_loss_grad);
+REGISTER_GPU(space_to_batch);
 REGISTER_GPU(space_to_depth);
 REGISTER_GPU(strided_slice);
 REGISTER_GPU(tile);
--- a/inference-engine/thirdparty/clDNN/src/gpu/space_to_batch_gpu.cpp
+++ b/inference-engine/thirdparty/clDNN/src/gpu/space_to_batch_gpu.cpp
@@ -0,0 +1,75 @@
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "space_to_batch_inst.h"
+#include "primitive_gpu_base.h"
+#include "implementation_map.h"
+#include "kernel_selector_helper.h"
+#include "space_to_batch/space_to_batch_kernel_selector.h"
+#include "space_to_batch/space_to_batch_kernel_ref.h"
+#include "error_handler.h"
+#include "data_inst.h"
+#include <vector>
+
+using namespace cldnn;
+
+namespace cldnn {
+namespace gpu {
+struct space_to_batch_gpu : typed_primitive_gpu_impl<space_to_batch> {
+    using parent = typed_primitive_gpu_impl<space_to_batch>;
+    using parent::parent;
+
+public:
+    static primitive_impl* create(const space_to_batch_node& arg) {
+        auto space_to_batch_params = get_default_params<kernel_selector::space_to_batch_params>(arg);
+        auto space_to_batch_optional_params =
+            get_default_optional_params<kernel_selector::space_to_batch_optional_params>(arg.get_program());
+
+        auto primitive = arg.get_primitive();
+
+        space_to_batch_params.block_shape = convert_dim_vector(primitive->block_shape);
+        space_to_batch_params.pads_begin = convert_dim_vector(primitive->pads_begin);
+        space_to_batch_params.pads_end = convert_dim_vector(primitive->pads_end);
+
+        auto& kernel_selector = kernel_selector::space_to_batch_kernel_selector::Instance();
+        auto best_kernels = kernel_selector.GetBestKernels(space_to_batch_params, space_to_batch_optional_params);
+
+        CLDNN_ERROR_BOOL(arg.id(),
+                         "Best_kernel.empty()",
+                         best_kernels.empty(),
+                         "Cannot find a proper kernel with this arguments");
+
+        auto space_to_batch = new space_to_batch_gpu(arg, best_kernels[0]);
+
+        return space_to_batch;
+    }
+};
+
+namespace detail {
+
+attach_space_to_batch_gpu::attach_space_to_batch_gpu() {
+    auto val_fw = space_to_batch_gpu::create;
+    implementation_map<space_to_batch>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), val_fw);
+    implementation_map<space_to_batch>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), val_fw);
+    implementation_map<space_to_batch>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfzyx), val_fw);
+    implementation_map<space_to_batch>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfzyx), val_fw);
+    implementation_map<space_to_batch>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfwzyx), val_fw);
+    implementation_map<space_to_batch>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfwzyx), val_fw);
+}
+
+}  // namespace detail
+}  // namespace gpu
+}  // namespace cldnn
--- a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp
+++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp
@@ -49,6 +49,7 @@
 #include "gather_inst.h"
 #include "reverse_sequence_inst.h"
 #include "shuffle_channels_inst.h"
+#include "space_to_batch_inst.h"
 #include "strided_slice_inst.h"
 #include "cum_sum_inst.h"
 #include "embedding_bag_inst.h"
@@ -200,10 +201,10 @@ void prepare_primitive_fusing::fuse_activations(program_impl &p) {
                 !input.is_type<permute>() && !input.is_type<pooling>() && !input.is_type<reorder>() &&
                 !input.is_type<reshape>() && !input.is_type<roi_pooling>() && !input.is_type<scale>() &&
                 !input.is_type<softmax>() && !input.is_type<resample>() && !input.is_type<mvn>() &&
-                 !input.is_type<depth_to_space>() && !input.is_type<batch_to_space>() && !input.is_type<gather>() &&
-                 !input.is_type<shuffle_channels>() && !input.is_type<strided_slice>() && !input.is_type<cum_sum>() &&
-                 !input.is_type<reverse_sequence>() && !input.is_type<embedding_bag>() && !input.is_type<fused_conv_eltwise>() &&
-                 !input.is_type<activation>()))
+                 !input.is_type<depth_to_space>() && !input.is_type<batch_to_space>() && !input.is_type<space_to_batch>() &&
+                 !input.is_type<gather>() && !input.is_type<shuffle_channels>() && !input.is_type<strided_slice>() &&
+                 !input.is_type<cum_sum>() && !input.is_type<reverse_sequence>() && !input.is_type<embedding_bag>() &&
+                 !input.is_type<fused_conv_eltwise>() && !input.is_type<activation>()))
                return;

            if (input.is_type<eltwise>()) {
--- a/inference-engine/thirdparty/clDNN/src/include/space_to_batch_inst.h
+++ b/inference-engine/thirdparty/clDNN/src/include/space_to_batch_inst.h
@@ -0,0 +1,49 @@
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#pragma once
+#include "api/space_to_batch.hpp"
+#include "primitive_inst.h"
+#include <string>
+
+namespace cldnn {
+template <>
+struct typed_program_node<space_to_batch> : public typed_program_node_base<space_to_batch> {
+    using parent = typed_program_node_base<space_to_batch>;
+
+public:
+    using parent::parent;
+
+    program_node& input(size_t index = 0) const { return get_dependency(index); }
+};
+
+using space_to_batch_node = typed_program_node<space_to_batch>;
+
+template <>
+class typed_primitive_inst<space_to_batch> : public typed_primitive_inst_base<space_to_batch> {
+    using parent = typed_primitive_inst_base<space_to_batch>;
+
+public:
+    static layout calc_output_layout(space_to_batch_node const& node);
+    static std::string to_string(space_to_batch_node const& node);
+
+public:
+    typed_primitive_inst(network_impl& network, space_to_batch_node const& desc);
+};
+
+using space_to_batch_inst = typed_primitive_inst<space_to_batch>;
+}  // namespace cldnn
--- a/inference-engine/thirdparty/clDNN/src/space_to_batch.cpp
+++ b/inference-engine/thirdparty/clDNN/src/space_to_batch.cpp
@@ -0,0 +1,90 @@
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "space_to_batch_inst.h"
+
+#include "primitive_type_base.h"
+#include "error_handler.h"
+#include "json_object.h"
+#include "data_inst.h"
+#include <string>
+#include <vector>
+
+namespace cldnn {
+primitive_type_id cldnn::space_to_batch::type_id() {
+    static primitive_type_base<space_to_batch> instance;
+    return &instance;
+}
+
+layout space_to_batch_inst::calc_output_layout(space_to_batch_node const& node) {
+    auto desc = node.get_primitive();
+
+    auto input_layout = node.input(0).get_output_layout();
+    auto input_format = input_layout.format;
+
+    const size_t dims_num = format::dimension(input_format);
+
+    const auto& block_shape = desc->block_shape;
+    const auto& pads_begin =  desc->pads_begin;
+    const auto& pads_end =  desc->pads_end;
+
+    if (block_shape.batch[0] != 1)
+        CLDNN_ERROR_MESSAGE(node.id(),
+            "block_shape[0] is expected to be 1. Actual block_shape[0] is " +
+            std::to_string(block_shape.batch[0]));
+
+    if (pads_begin.batch[0] != 0)
+        CLDNN_ERROR_MESSAGE(node.id(),
+            "pads_begin[0] is expected to be 0. Actual pads_begin[0] is " +
+            std::to_string(pads_begin.batch[0]));
+
+    if (pads_end.batch[0] != 0)
+        CLDNN_ERROR_MESSAGE(node.id(),
+            "pads_end[0] is expected to be 0. Actual pads_end[0] is " +
+            std::to_string(pads_end.batch[0]));
+
+    if ((input_layout.size.sizes(input_format)[1] + pads_begin.feature[0] + pads_end.feature[0]) % block_shape.feature[0] != 0)
+            CLDNN_ERROR_MESSAGE(node.id(),
+                "Input feature shape after padding must be divisible by block_shape");
+
+    for (size_t i = 2; i < dims_num; ++i)
+        if ((input_layout.size.sizes(input_format)[dims_num - i + 1] + pads_begin.sizes()[i] + pads_end.sizes()[i]) % block_shape.sizes()[i] != 0)
+            CLDNN_ERROR_MESSAGE(node.id(),
+                "Input spatial shapes after padding must be divisible by block_shape");
+
+    return layout{input_layout.data_type, input_format, desc->out_size};
+}
+
+std::string space_to_batch_inst::to_string(space_to_batch_node const& node) {
+    auto desc = node.get_primitive();
+    auto node_info = node.desc_to_json();
+    auto& input = node.input();
+
+    std::stringstream primitive_description;
+
+    json_composite space_to_batch_info;
+    space_to_batch_info.add("input id", input.id());
+
+    node_info->add("space_to_batch_info", space_to_batch_info);
+    node_info->dump(primitive_description);
+
+    return primitive_description.str();
+}
+
+space_to_batch_inst::typed_primitive_inst(network_impl& network, space_to_batch_node const& node)
+    : parent(network, node) {}
+
+}  // namespace cldnn
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/space_to_batch_gpu_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/space_to_batch_gpu_test.cpp
@@ -0,0 +1,495 @@
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#include <gtest/gtest.h>
+
+#include <api/input_layout.hpp>
+#include <api/memory.hpp>
+#include <api/space_to_batch.hpp>
+#include <api/topology.hpp>
+#include <api/network.hpp>
+#include <api/data.hpp>
+
+#include <cstddef>
+#include <tests/test_utils/test_utils.h>
+
+using namespace cldnn;
+using namespace ::tests;
+
+TEST(space_to_batch_fp16_gpu, i1222_bs1222_pb0000_pe0000) {
+    // Input :       1x2x2x2
+    // Block shape : 1x2x2x2
+    // Pads begin :  0x0x0x0
+    // Pads end :    0x0x0x0
+    // Output :      8x1x1x1
+    // Input values in fp16
+
+    engine engine;
+    auto input = memory::allocate(engine, { data_types::f16, format::bfyx, {1,2,2,2} });
+
+    set_values(input, {
+        FLOAT16(0.0f), FLOAT16(1.0f),
+        FLOAT16(2.0f), FLOAT16(3.0f),
+        FLOAT16(4.0f), FLOAT16(5.0f),
+        FLOAT16(6.0f), FLOAT16(7.0f)
+    });
+
+    topology topology;
+    topology.add(input_layout("Input", input.get_layout()));
+    topology.add(space_to_batch("space_to_batch", "Input", tensor(format::bfyx, {1,2,2,2}, 1),
+                                                           tensor(format::bfyx, {0,0,0,0}, 0),
+                                                           tensor(format::bfyx, {0,0,0,0}, 0),
+                                                           tensor(format::bfyx, {8,1,1,1}, 1)));
+    network network(engine, topology);
+
+    network.set_input_data("Input", input);
+
+    auto outputs = network.execute();
+
+    auto output = outputs.at("space_to_batch").get_memory();
+    auto output_ptr = output.pointer<uint16_t>();
+
+    std::vector<float> expected_results = {
+        0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f
+    };
+
+    ASSERT_EQ(output_ptr.size(), expected_results.size());
+
+    for (size_t i = 0; i < expected_results.size(); ++i) {
+        EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i]));
+    }
+}
+
+TEST(space_to_batch_fp16_gpu, i1242_bs1221_pb0020_pe0000) {
+    // Input :       1x2x4x2
+    // Block shape : 1x2x2x1
+    // Pads begin :  0x0x2x0
+    // Pads end :    0x0x0x0
+    // Output :      4x1x3x2
+    // Input values in fp16
+
+    engine engine;
+    auto input = memory::allocate(engine, { data_types::f16, format::bfyx, {1,2,2,4} });
+
+    set_values(input, {
+        FLOAT16(0.0f), FLOAT16(1.0f), FLOAT16(2.0f), FLOAT16(3.0f),
+        FLOAT16(4.0f), FLOAT16(5.0f), FLOAT16(6.0f), FLOAT16(7.0f),
+        FLOAT16(8.0f), FLOAT16(9.0f), FLOAT16(10.0f), FLOAT16(11.0f),
+        FLOAT16(12.0f), FLOAT16(13.0f), FLOAT16(14.0f), FLOAT16(15.0f)
+    });
+
+    topology topology;
+    topology.add(input_layout("Input", input.get_layout()));
+    topology.add(space_to_batch("space_to_batch", "Input", tensor(format::bfyx, {1,2,2,1}, 1),
+                                                           tensor(format::bfyx, {0,0,2,0}, 0),
+                                                           tensor(format::bfyx, {0,0,0,0}, 0),
+                                                           tensor(format::bfyx, {4,1,3,2}, 1)));
+    network network(engine, topology);
+
+    network.set_input_data("Input", input);
+
+    auto outputs = network.execute();
+
+    auto output = outputs.at("space_to_batch").get_memory();
+    auto output_ptr = output.pointer<uint16_t>();
+
+    std::vector<float> expected_results = {
+        0.f, 0.f, 0.f, 1.f, 4.f, 5.f,
+        0.f, 0.f, 2.f, 3.f, 6.f, 7.f,
+        0.f, 0.f, 8.f, 9.f, 12.f, 13.f,
+        0.f, 0.f, 10.f, 11.f, 14.f, 15.f
+    };
+
+    ASSERT_EQ(output_ptr.size(), expected_results.size());
+
+    for (size_t i = 0; i < expected_results.size(); ++i) {
+        EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i]));
+    }
+}
+
+TEST(space_to_batch_fp16_gpu, i2132_bs1222_pb0010_pe0100) {
+    // Input :       2x1x3x2
+    // Block shape : 1x2x2x2
+    // Pads begin :  0x0x1x0
+    // Pads end :    0x1x0x0
+    // Output :      16x1x2x1
+    // Input values in fp16
+
+    engine engine;
+    auto input = memory::allocate(engine, { data_types::f16, format::bfyx, {2,1,2,3} });
+
+    set_values(input, {
+        FLOAT16(0.0f), FLOAT16(1.0f), FLOAT16(2.0f), FLOAT16(3.0f),
+        FLOAT16(4.0f), FLOAT16(5.0f), FLOAT16(6.0f), FLOAT16(7.0f),
+        FLOAT16(8.0f), FLOAT16(9.0f), FLOAT16(10.0f), FLOAT16(11.0f)
+    });
+
+    topology topology;
+    topology.add(input_layout("Input", input.get_layout()));
+    topology.add(space_to_batch("space_to_batch", "Input", tensor(format::bfyx, {1,2,2,2}, 1),
+                                                           tensor(format::bfyx, {0,0,1,0}, 0),
+                                                           tensor(format::bfyx, {0,1,0,0}, 0),
+                                                           tensor(format::bfyx, {16,1,2,1}, 1)));
+    network network(engine, topology);
+
+    network.set_input_data("Input", input);
+
+    auto outputs = network.execute();
+
+    auto output = outputs.at("space_to_batch").get_memory();
+    auto output_ptr = output.pointer<uint16_t>();
+
+    std::vector<float> expected_results = {
+        0.f, 2.f, 0.f, 8.f, 0.f, 3.f, 0.f, 9.f,
+        0.f, 4.f, 6.f, 10.f, 1.f, 5.f, 7.f, 11.f,
+        0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+        0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f
+    };
+
+    ASSERT_EQ(output_ptr.size(), expected_results.size());
+
+    for (size_t i = 0; i < expected_results.size(); ++i) {
+        EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i]));
+    }
+}
+
+TEST(space_to_batch_fp16_gpu, i12132_bs12122_pb00010_pe00000) {
+    // Input :       1x2x1x3x2
+    // Block shape : 1x2x1x2x2
+    // Pads begin :  0x0x0x1x0
+    // Pads end :    0x0x0x0x0
+    // Output :      8x1x1x2x1
+    // Input values in fp16
+
+    engine engine;
+    auto input = memory::allocate(engine, { data_types::f16, format::bfzyx, {1,2,2,3,1} });
+
+    set_values(input, {
+        FLOAT16(0.0f), FLOAT16(1.0f), FLOAT16(2.0f), FLOAT16(3.0f),
+        FLOAT16(4.0f), FLOAT16(5.0f), FLOAT16(6.0f), FLOAT16(7.0f),
+        FLOAT16(8.0f), FLOAT16(9.0f), FLOAT16(10.0f), FLOAT16(11.0f)
+    });
+
+    topology topology;
+    topology.add(input_layout("Input", input.get_layout()));
+    topology.add(space_to_batch("space_to_batch", "Input", tensor(format::bfzyx, {1,2,1,2,2}, 1),
+                                                           tensor(format::bfzyx, {0,0,0,1,0}, 0),
+                                                           tensor(format::bfzyx, {0,0,0,0,0}, 0),
+                                                           tensor(format::bfzyx, {8,1,1,2,1}, 1)));
+    network network(engine, topology);
+
+    network.set_input_data("Input", input);
+
+    auto outputs = network.execute();
+
+    auto output = outputs.at("space_to_batch").get_memory();
+    auto output_ptr = output.pointer<uint16_t>();
+
+    std::vector<float> expected_results = {
+        0.f, 2.f, 0.f, 3.f, 0.f, 4.f, 1.f, 5.f,
+        0.f, 8.f, 0.f, 9.f, 6.f, 10.f, 7.f, 11.f
+    };
+
+    ASSERT_EQ(output_ptr.size(), expected_results.size());
+
+    for (size_t i = 0; i < expected_results.size(); ++i) {
+        EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i]));
+    }
+}
+
+TEST(space_to_batch_fp16_gpu, i134121_bs142121_pb010100_pe000000) {
+    // Input :       1x3x4x1x2x1
+    // Block shape : 1x4x2x1x2x1
+    // Pads begin :  0x1x0x1x0x0
+    // Pads end :    0x0x0x0x0x0
+    // Output :      16x1x2x2x1x1
+    // Input values in fp16
+
+    engine engine;
+    tensor input_shape = tensor{ batch(1), feature(3), spatial(1, 2, 1, 4) };
+    auto input = memory::allocate(engine, { data_types::f16, format::bfwzyx, input_shape });
+
+    set_values(input, {
+        FLOAT16(0.0f), FLOAT16(1.0f), FLOAT16(2.0f), FLOAT16(3.0f),
+        FLOAT16(4.0f), FLOAT16(5.0f), FLOAT16(6.0f), FLOAT16(7.0f),
+        FLOAT16(8.0f), FLOAT16(9.0f), FLOAT16(10.0f), FLOAT16(11.0f),
+        FLOAT16(12.0f), FLOAT16(13.0f), FLOAT16(14.0f), FLOAT16(15.0f),
+        FLOAT16(16.0f), FLOAT16(17.0f), FLOAT16(18.0f), FLOAT16(19.0f),
+        FLOAT16(20.0f), FLOAT16(21.0f), FLOAT16(22.0f), FLOAT16(23.0f)
+    });
+
+    topology topology;
+    topology.add(input_layout("Input", input.get_layout()));
+    topology.add(space_to_batch("space_to_batch", "Input", tensor(format::bfwzyx, {1,4,2,1,2,1}, 1),
+                                                           tensor(format::bfwzyx, {0,1,0,1,0,0}, 0),
+                                                           tensor(format::bfwzyx, {0,0,0,0,0,0}, 0),
+                                                           tensor(format::bfwzyx, {16,1,2,2,1,1}, 1)));
+    network network(engine, topology);
+
+    network.set_input_data("Input", input);
+
+    auto outputs = network.execute();
+
+    auto output = outputs.at("space_to_batch").get_memory();
+    auto output_ptr = output.pointer<uint16_t>();
+
+    std::vector<float> expected_results = {
+        0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+        0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+        0.f, 0.f, 0.f, 4.f, 0.f, 1.f, 0.f, 5.f,
+        0.f, 2.f, 0.f, 6.f, 0.f, 3.f, 0.f, 7.f,
+        0.f, 8.f, 0.f, 12.f, 0.f, 9.f, 0.f, 13.f,
+        0.f, 10.f, 0.f, 14.f, 0.f, 11.f, 0.f, 15.f,
+        0.f, 16.f, 0.f, 20.f, 0.f, 17.f, 0.f, 21.f,
+        0.f, 18.f, 0.f, 22.f, 0.f, 19.f, 0.f, 23.f
+    };
+
+    ASSERT_EQ(output_ptr.size(), expected_results.size());
+
+    for (size_t i = 0; i < expected_results.size(); ++i) {
+        EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i]));
+    }
+}
+
+TEST(space_to_batch_fp32_gpu, i1222_bs1222_pb0000_pe0000) {
+    // Input :       1x2x2x2
+    // Block shape : 1x2x2x2
+    // Pads begin :  0x0x0x0
+    // Pads end :    0x0x0x0
+    // Output :      8x1x1x1
+    // Input values in fp32
+
+    engine engine;
+    auto input = memory::allocate(engine, { data_types::f32, format::bfyx, {1,2,2,2} });
+
+    set_values(input, {
+        0.0f, 1.0f, 2.0f, 3.0f,
+        4.0f, 5.0f, 6.0f, 7.0f
+    });
+
+    topology topology;
+    topology.add(input_layout("Input", input.get_layout()));
+    topology.add(space_to_batch("space_to_batch", "Input", tensor(format::bfyx, {1,2,2,2}, 1),
+                                                           tensor(format::bfyx, {0,0,0,0}, 0),
+                                                           tensor(format::bfyx, {0,0,0,0}, 0),
+                                                           tensor(format::bfyx, {8,1,1,1}, 1)));
+    network network(engine, topology);
+
+    network.set_input_data("Input", input);
+
+    auto outputs = network.execute();
+
+    auto output = outputs.at("space_to_batch").get_memory();
+    auto output_ptr = output.pointer<float>();
+
+    std::vector<float> expected_results = {
+        0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f
+    };
+
+    ASSERT_EQ(output_ptr.size(), expected_results.size());
+
+    for (size_t i = 0; i < expected_results.size(); ++i) {
+        EXPECT_EQ(expected_results[i], output_ptr[i]);
+    }
+}
+
+TEST(space_to_batch_fp32_gpu, i1242_bs1221_pb0020_pe0000) {
+    // Input :       1x2x4x2
+    // Block shape : 1x2x2x1
+    // Pads begin :  0x0x2x0
+    // Pads end :    0x0x0x0
+    // Output :      4x1x3x2
+    // Input values in fp32
+
+    engine engine;
+    auto input = memory::allocate(engine, { data_types::f32, format::bfyx, {1,2,2,4} });
+
+    set_values(input, {
+        0.0f, 1.0f, 2.0f, 3.0f,
+        4.0f, 5.0f, 6.0f, 7.0f,
+        8.0f, 9.0f, 10.0f, 11.0f,
+        12.0f, 13.0f, 14.0f, 15.0f
+    });
+
+    topology topology;
+    topology.add(input_layout("Input", input.get_layout()));
+    topology.add(space_to_batch("space_to_batch", "Input", tensor(format::bfyx, {1,2,2,1}, 1),
+                                                           tensor(format::bfyx, {0,0,2,0}, 0),
+                                                           tensor(format::bfyx, {0,0,0,0}, 0),
+                                                           tensor(format::bfyx, {4,1,3,2}, 1)));
+    network network(engine, topology);
+
+    network.set_input_data("Input", input);
+
+    auto outputs = network.execute();
+
+    auto output = outputs.at("space_to_batch").get_memory();
+    auto output_ptr = output.pointer<float>();
+
+    std::vector<float> expected_results = {
+        0.f, 0.f, 0.f, 1.f, 4.f, 5.f,
+        0.f, 0.f, 2.f, 3.f, 6.f, 7.f,
+        0.f, 0.f, 8.f, 9.f, 12.f, 13.f,
+        0.f, 0.f, 10.f, 11.f, 14.f, 15.f
+    };
+
+    ASSERT_EQ(output_ptr.size(), expected_results.size());
+
+    for (size_t i = 0; i < expected_results.size(); ++i) {
+        EXPECT_EQ(expected_results[i], output_ptr[i]);
+    }
+}
+
+TEST(space_to_batch_fp32_gpu, i2132_bs1222_pb0010_pe0100) {
+    // Input :       2x1x3x2
+    // Block shape : 1x2x2x2
+    // Pads begin :  0x0x1x0
+    // Pads end :    0x1x0x0
+    // Output :      16x1x2x1
+    // Input values in fp32
+
+    engine engine;
+    auto input = memory::allocate(engine, { data_types::f32, format::bfyx, {2,1,2,3} });
+
+    set_values(input, {
+        0.0f, 1.0f, 2.0f, 3.0f,
+        4.0f, 5.0f, 6.0f, 7.0f,
+        8.0f, 9.0f, 10.0f, 11.0f
+    });
+
+    topology topology;
+    topology.add(input_layout("Input", input.get_layout()));
+    topology.add(space_to_batch("space_to_batch", "Input", tensor(format::bfyx, {1,2,2,2}, 1),
+                                                           tensor(format::bfyx, {0,0,1,0}, 0),
+                                                           tensor(format::bfyx, {0,1,0,0}, 0),
+                                                           tensor(format::bfyx, {16,1,2,1}, 1)));
+    network network(engine, topology);
+
+    network.set_input_data("Input", input);
+
+    auto outputs = network.execute();
+
+    auto output = outputs.at("space_to_batch").get_memory();
+    auto output_ptr = output.pointer<float>();
+
+    std::vector<float> expected_results = {
+        0.f, 2.f, 0.f, 8.f, 0.f, 3.f, 0.f, 9.f,
+        0.f, 4.f, 6.f, 10.f, 1.f, 5.f, 7.f, 11.f,
+        0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+        0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f
+    };
+
+    ASSERT_EQ(output_ptr.size(), expected_results.size());
+
+    for (size_t i = 0; i < expected_results.size(); ++i) {
+        EXPECT_EQ(expected_results[i], output_ptr[i]);
+    }
+}
+
+TEST(space_to_batch_fp32_gpu, i12132_bs12122_pb00010_pe00000) {
+    // Input :       1x2x1x3x2
+    // Block shape : 1x2x1x2x2
+    // Pads begin :  0x0x0x1x0
+    // Pads end :    0x0x0x0x0
+    // Output :      8x1x1x2x1
+    // Input values in fp32
+
+    engine engine;
+    auto input = memory::allocate(engine, { data_types::f32, format::bfzyx, {1,2,2,3,1} });
+
+    set_values(input, {
+        0.0f, 1.0f, 2.0f, 3.0f,
+        4.0f, 5.0f, 6.0f, 7.0f,
+        8.0f, 9.0f, 10.0f, 11.0f
+    });
+
+    topology topology;
+    topology.add(input_layout("Input", input.get_layout()));
+    topology.add(space_to_batch("space_to_batch", "Input", tensor(format::bfzyx, {1,2,1,2,2}, 1),
+                                                           tensor(format::bfzyx, {0,0,0,1,0}, 0),
+                                                           tensor(format::bfzyx, {0,0,0,0,0}, 0),
+                                                           tensor(format::bfzyx, {8,1,1,2,1}, 1)));
+    network network(engine, topology);
+
+    network.set_input_data("Input", input);
+
+    auto outputs = network.execute();
+
+    auto output = outputs.at("space_to_batch").get_memory();
+    auto output_ptr = output.pointer<float>();
+
+    std::vector<float> expected_results = {
+        0.f, 2.f, 0.f, 3.f, 0.f, 4.f, 1.f, 5.f,
+        0.f, 8.f, 0.f, 9.f, 6.f, 10.f, 7.f, 11.f
+    };
+
+    ASSERT_EQ(output_ptr.size(), expected_results.size());
+
+    for (size_t i = 0; i < expected_results.size(); ++i) {
+        EXPECT_EQ(expected_results[i], output_ptr[i]);
+    }
+}
+
+TEST(space_to_batch_fp32_gpu, i134121_bs142121_pb010100_pe000000) {
+    // Input :       1x3x4x1x2x1
+    // Block shape : 1x4x2x1x2x1
+    // Pads begin :  0x1x0x1x0x0
+    // Pads end :    0x0x0x0x0x0
+    // Output :      16x1x2x2x1x1
+    // Input values in fp32
+
+    engine engine;
+    tensor input_shape = tensor{ batch(1), feature(3), spatial(1, 2, 1, 4) };
+    auto input = memory::allocate(engine, { data_types::f32, format::bfwzyx, input_shape });
+
+    set_values(input, {
+       0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+       6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f,
+       12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f,
+       18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f
+    });
+
+    topology topology;
+    topology.add(input_layout("Input", input.get_layout()));
+    topology.add(space_to_batch("space_to_batch", "Input", tensor(format::bfwzyx, {1,4,2,1,2,1}, 1),
+                                                           tensor(format::bfwzyx, {0,1,0,1,0,0}, 0),
+                                                           tensor(format::bfwzyx, {0,0,0,0,0,0}, 0),
+                                                           tensor(format::bfwzyx, {16,1,2,2,1,1}, 1)));
+    network network(engine, topology);
+
+    network.set_input_data("Input", input);
+
+    auto outputs = network.execute();
+
+    auto output = outputs.at("space_to_batch").get_memory();
+    auto output_ptr = output.pointer<float>();
+
+    std::vector<float> expected_results = {
+        0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+        0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+        0.f, 0.f, 0.f, 4.f, 0.f, 1.f, 0.f, 5.f,
+        0.f, 2.f, 0.f, 6.f, 0.f, 3.f, 0.f, 7.f,
+        0.f, 8.f, 0.f, 12.f, 0.f, 9.f, 0.f, 13.f,
+        0.f, 10.f, 0.f, 14.f, 0.f, 11.f, 0.f, 15.f,
+        0.f, 16.f, 0.f, 20.f, 0.f, 17.f, 0.f, 21.f,
+        0.f, 18.f, 0.f, 22.f, 0.f, 19.f, 0.f, 23.f
+    };
+
+    ASSERT_EQ(output_ptr.size(), expected_results.size());
+
+    for (size_t i = 0; i < expected_results.size(); ++i) {
+        EXPECT_EQ(expected_results[i], output_ptr[i]);
+    }
+}