From d4b071bd4993a9ce57e56259ba5cafae10f32c86 Mon Sep 17 00:00:00 2001 From: Mingyu Kim Date: Thu, 15 Apr 2021 17:18:34 +0900 Subject: [PATCH] [IE CLDNN] Batched blob support is added for NV12 (#5230) --- .../src/cldnn_engine/cldnn_engine.cpp | 1 + .../src/cldnn_engine/cldnn_infer_request.cpp | 146 ++++++++++------- .../src/cldnn_engine/ops/parameter.cpp | 78 +++++---- .../cldnn_remote_blob_tests.cpp | 148 +++++++++++++++++- .../behavior/set_blob_of_kind.cpp | 5 +- 5 files changed, 286 insertions(+), 92 deletions(-) diff --git a/inference-engine/src/cldnn_engine/cldnn_engine.cpp b/inference-engine/src/cldnn_engine/cldnn_engine.cpp index 01ea25f87ee..c4ea12d757a 100644 --- a/inference-engine/src/cldnn_engine/cldnn_engine.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_engine.cpp @@ -852,6 +852,7 @@ Parameter clDNNEngine::GetMetric(const std::string& name, const std::map network, } } +void checkInputBlobNV12(const NV12Blob *nv12_ptr) { + auto y_ptr = nv12_ptr->y()->as(); + + // if the blobs are not remote, check their size + if (!y_ptr) { + if (nv12_ptr->y()->buffer() == nullptr) IE_THROW(NotAllocated) << str_not_allocated; + } + + auto uv_ptr = nv12_ptr->uv()->as(); + if (!uv_ptr) { + if (nv12_ptr->uv()->buffer() == nullptr) IE_THROW(NotAllocated) << str_not_allocated; + } +} + +NV12Blob *getNV12BlobOrException(BatchedBlob *batched_ptr, int idx) { + auto nv12_ptr = batched_ptr->getBlob(idx)->as(); + if (nv12_ptr == nullptr) + IE_THROW(NotImplemented) << unsupported_batched_blob; + return nv12_ptr; +} + void checkInputBlob(const Blob::Ptr &blob, const std::string &name, const InputInfo::Ptr foundInput, @@ -334,23 +356,17 @@ void checkInputBlob(const Blob::Ptr &blob, if (ColorFormat::NV12 == foundInput->getPreProcess().getColorFormat() && nv12_two_inputs) { - auto nv12_ptr = blob->as(); - - if (nv12_ptr == nullptr) { + if (auto nv12_ptr = blob->as()) { + checkInputBlobNV12(nv12_ptr); + } else if (auto batched_ptr = blob->as()) { + for (auto i = 0; i < 
batched_ptr->size(); i++) { + auto nv12_ptr = getNV12BlobOrException(batched_ptr, i); + checkInputBlobNV12(nv12_ptr); + } + } else { IE_THROW(ParameterMismatch) << wrong_nv12_blob; } - auto y_ptr = nv12_ptr->y()->as(); - - // if the blobs are not remote, check their size - if (!y_ptr) { - if (nv12_ptr->y()->buffer() == nullptr) IE_THROW() << str_not_allocated; - } - - auto uv_ptr = nv12_ptr->uv()->as(); - if (!uv_ptr) { - if (nv12_ptr->uv()->buffer() == nullptr) IE_THROW() << str_not_allocated; - } } else { SizeVector dims = foundInput->getTensorDesc().getDims(); @@ -498,27 +514,35 @@ void CLDNNInferRequest::SetBlob(const std::string& name, const Blob::Ptr &data) // and put them into appropriate network inputs // that should then go into biplanar NV12 reorder auto nv12_ptr = data->as(); + auto batched_ptr = data->as(); - if (nv12_ptr == nullptr) { + if (nv12_ptr != nullptr || batched_ptr != nullptr) { + int num_blobs = batched_ptr != nullptr ? batched_ptr->size() : 1; + + for (auto i = 0; i < num_blobs; i++) { + if (batched_ptr != nullptr) + nv12_ptr = getNV12BlobOrException(batched_ptr, i); + + auto y_ptr = nv12_ptr->y()->as(); + if (y_ptr) { + auto y_impl = getBlobImpl(y_ptr); + y_impl->allocate_if_needed(); + input_attach(internalName + "_Y" + std::to_string(i), y_impl->getMemory()); + is_remote = true; + } + + auto uv_ptr = nv12_ptr->uv()->as(); + if (uv_ptr) { + auto uv_impl = getBlobImpl(uv_ptr); + uv_impl->allocate_if_needed(); + input_attach(internalName + "_UV" + std::to_string(i), uv_impl->getMemory()); + is_remote = true; + } + } + } else { IE_THROW(ParameterMismatch) << wrong_nv12_blob; } - auto y_ptr = nv12_ptr->y()->as(); - if (y_ptr) { - auto y_impl = getBlobImpl(y_ptr); - y_impl->allocate_if_needed(); - input_attach(internalName + "_Y", y_impl->getMemory()); - is_remote = true; - } - - auto uv_ptr = nv12_ptr->uv()->as(); - if (uv_ptr) { - auto uv_impl = getBlobImpl(uv_ptr); - uv_impl->allocate_if_needed(); - input_attach(internalName + "_UV", 
uv_impl->getMemory());
-                is_remote = true;
-            }
-
             if (is_remote) _inputs[name] = data;
         }
     }
@@ -582,28 +606,33 @@ void CLDNNInferRequest::AllocateInputs() {
         if (ColorFormat::NV12 == ni.second->getPreProcess().getColorFormat() &&
             m_graph->getConfig().nv12_two_inputs) {
-            cldnn::primitive_id YName(name + "_Y");
-            cldnn::primitive_id UVName(name + "_UV");
+            std::vector blobs;
+            for (auto i = 0; i < desc.getDims()[0]; i++) {
+                cldnn::primitive_id YName(name + "_Y" + std::to_string(i));
+                cldnn::primitive_id UVName(name + "_UV" + std::to_string(i));
 
-            if (inputLayouts.find(YName) == inputLayouts.end()) {
-                IE_THROW() << "Input layout for " << YName << " is not found";
+                if (inputLayouts.find(YName) == inputLayouts.end()) {
+                    IE_THROW(ParameterMismatch) << "Input layout for " << YName << " is not found";
+                }
+                if (inputLayouts.find(UVName) == inputLayouts.end()) {
+                    IE_THROW(ParameterMismatch) << "Input layout for " << UVName << " is not found";
+                }
+                input_alloc(YName, inputLayouts.at(YName));
+                input_alloc(UVName, inputLayouts.at(UVName));
+
+                size_t height = desc.getDims()[2], width = desc.getDims()[3];
+                cldnn::pointer input_mem_ptr_Y = inputsMemory.at(YName).pointer();
+                TensorDesc ydesc(Precision::U8, { 1, 1, height, width }, Layout::NHWC);
+                auto blobY = createInputBlob(ydesc, input_mem_ptr_Y.data());
+
+                cldnn::pointer input_mem_ptr_UV = inputsMemory.at(UVName).pointer();
+                TensorDesc uvdesc(Precision::U8, { 1, 2, height / 2, width / 2 }, Layout::NHWC);
+                auto blobUV = createInputBlob(uvdesc, input_mem_ptr_UV.data());
+
+                blobs.push_back(make_shared_blob(blobY, blobUV));
             }
-            if (inputLayouts.find(UVName) == inputLayouts.end()) {
-                IE_THROW() << "Input layout for " << UVName << " is not found";
-            }
-            input_alloc(YName, inputLayouts.at(YName));
-            input_alloc(UVName, inputLayouts.at(UVName));
+            _inputs[name] = desc.getDims()[0] == 1 ? 
blobs[0] : make_shared_blob(blobs); - size_t height = desc.getDims()[2], width = desc.getDims()[3]; - cldnn::pointer input_mem_ptr_Y = inputsMemory.at(YName).pointer(); - TensorDesc ydesc(Precision::U8, { 1, 1, height, width }, Layout::NHWC); - auto blobY = createInputBlob(ydesc, input_mem_ptr_Y.data()); - - cldnn::pointer input_mem_ptr_UV = inputsMemory.at(UVName).pointer(); - TensorDesc uvdesc(Precision::U8, { 1, 2, height / 2, width / 2 }, Layout::NHWC); - auto blobUV = createInputBlob(uvdesc, input_mem_ptr_UV.data()); - - _inputs[name] = make_shared_blob(blobY, blobUV); } else { if (inputLayouts.find(name) == inputLayouts.end()) { IE_THROW() << "Input layout for " << name << " is not found"; @@ -868,14 +897,21 @@ void CLDNNInferRequest::InferImpl() { PrepareInputDyn(name, *inputBlob); } else { auto nv12_ptr = inputBlob->as(); + auto batched_ptr = inputBlob->as(); - if (nv12_ptr == nullptr) { + if (nv12_ptr != nullptr || batched_ptr != nullptr) { + // special case for NV12 input blob + int num_blobs = batched_ptr != nullptr ? 
batched_ptr->size() : 1; + for (auto i = 0; i < num_blobs; i++) { + if (batched_ptr != nullptr) + nv12_ptr = getNV12BlobOrException(batched_ptr, i); + + PrepareInput(name + "_Y" + std::to_string(i), *nv12_ptr->y()); + PrepareInput(name + "_UV" + std::to_string(i), *nv12_ptr->uv()); + } + } else { // regular blob PrepareInput(name, *inputBlob); - } else { - // special case for NV12 input blob - PrepareInput(name + "_Y", *nv12_ptr->y()); - PrepareInput(name + "_UV", *nv12_ptr->uv()); } } } diff --git a/inference-engine/src/cldnn_engine/ops/parameter.cpp b/inference-engine/src/cldnn_engine/ops/parameter.cpp index 19286ef9278..df4290291f7 100644 --- a/inference-engine/src/cldnn_engine/ops/parameter.cpp +++ b/inference-engine/src/cldnn_engine/ops/parameter.cpp @@ -10,6 +10,7 @@ #include "api/input_layout.hpp" #include "api/reorder.hpp" #include "api/data.hpp" +#include "api/concatenation.hpp" using namespace InferenceEngine; @@ -97,7 +98,6 @@ void CreateParameterOp(Program& p, const std::shared_ptrget_output_element_type(0)); - auto preprocessPrimID = "reorder:" + inputName + Program::m_preProcessTag; cldnn::primitive_id meanBlobID = inputName + Program::m_meanValuesTag; std::vector meanValues; @@ -184,41 +184,55 @@ void CreateParameterOp(Program& p, const std::shared_ptr reorders; - std::string y_name = inputName + "_Y"; - std::string uv_name = inputName + "_UV"; + for (auto i = 0; i < inputDims[0]; i++) { + auto preprocessPrimID = "reorder:" + inputName + std::to_string(i) + Program::m_preProcessTag; + std::string y_name = inputName + "_Y" + std::to_string(i); + std::string uv_name = inputName + "_UV" + std::to_string(i); - cldnn::layout y_layout(DataTypeFromPrecision(ip), - cldnn::format::nv12, { 1, 1, width, height }); - cldnn::layout uv_layout(DataTypeFromPrecision(ip), - cldnn::format::nv12, { 1, 2, width / 2, height / 2 }); - auto inputY = cldnn::input_layout(y_name, y_layout); - auto inputUV = cldnn::input_layout(uv_name, uv_layout); + cldnn::layout 
y_layout(DataTypeFromPrecision(ip), + cldnn::format::nv12, { 1, 1, width, height }); + cldnn::layout uv_layout(DataTypeFromPrecision(ip), + cldnn::format::nv12, { 1, 2, width / 2, height / 2 }); + auto inputY = cldnn::input_layout(y_name, y_layout); + auto inputUV = cldnn::input_layout(uv_name, uv_layout); - p.AddPrimitive(inputY); - p.inputLayouts.insert({ inputInfo->name() + "_Y", y_layout }); - p.AddPrimitive(inputUV); - p.inputLayouts.insert({ inputInfo->name() + "_UV", uv_layout }); - switch (preProcess.getMeanVariant()) { - case NONE: - case MEAN_VALUE: { - p.AddPrimitive(cldnn::reorder(preprocessPrimID, y_name, uv_name, networkInputLayout, meanValues)); - break; - } - case MEAN_IMAGE: { - p.AddPrimitive(cldnn::reorder(preprocessPrimID, y_name, uv_name, networkInputLayout, meanBlobID)); - break; - } - default: IE_THROW() << "Invalid mean variant in input " + inputName; - break; + p.AddPrimitive(inputY); + p.inputLayouts.insert({ inputInfo->name() + "_Y" + std::to_string(i), y_layout }); + p.AddPrimitive(inputUV); + p.inputLayouts.insert({ inputInfo->name() + "_UV" + std::to_string(i), uv_layout }); + switch (preProcess.getMeanVariant()) { + case NONE: + case MEAN_VALUE: { + p.AddPrimitive(cldnn::reorder(preprocessPrimID, y_name, uv_name, networkInputLayout, meanValues)); + break; + } + case MEAN_IMAGE: { + p.AddPrimitive(cldnn::reorder(preprocessPrimID, y_name, uv_name, networkInputLayout, meanBlobID)); + break; + } + default: IE_THROW(Unexpected) << "Invalid mean variant in input " + inputName; + break; + } + + p.primitivesToIRLayersMap[preprocessPrimID] = { inputInfo->name() }; + p.primitivesToIRLayersMap[y_name] = { inputInfo->name() }; + p.primitivesToIRLayersMap[uv_name] = { inputInfo->name() }; + p.profilingIDs.push_back(preprocessPrimID); + p.InitProfileInfo(preprocessPrimID, "Reorder"); + p.primitiveIDs[inputName] = preprocessPrimID; // If it is batched blob, it will be overwritten afterwards. 
+ p.primitiveIDs[preprocessPrimID] = preprocessPrimID; + reorders.push_back(preprocessPrimID); } - p.primitivesToIRLayersMap[preprocessPrimID] = { inputInfo->name() }; - p.primitivesToIRLayersMap[y_name] = { inputInfo->name() }; - p.primitivesToIRLayersMap[uv_name] = { inputInfo->name() }; - p.profilingIDs.push_back(preprocessPrimID); - p.InitProfileInfo(preprocessPrimID, "Reorder"); + if (inputDims[0] > 1) { + auto concatPrimID = "concat:" + inputName + Program::m_preProcessTag; + p.AddPrimitive(cldnn::concatenation(concatPrimID, reorders, cldnn::concatenation::along_b)); + p.primitiveIDs[inputName] = concatPrimID; + } } else { + auto preprocessPrimID = "reorder:" + inputName + Program::m_preProcessTag; cldnn::layout inputLayout(networkInputLayout); inputLayout.data_type = DataTypeFromPrecision(ip); p.inputLayouts.insert({ inputInfo->name(), inputLayout }); @@ -244,11 +258,9 @@ void CreateParameterOp(Program& p, const std::shared_ptr { + void SetUp() override { + num_batch = this->GetParam(); + }; +public: + static std::string getTestCaseName(const testing::TestParamInfo &obj) { + return "num_batch_" + std::to_string(obj.param); + } + +protected: + size_t num_batch; + std::vector> fn_ptrs; +}; + +TEST_P(BatchedBlob_Test, canInputNV12) { +#if defined(_WIN32) || defined(ANDROID) + GTEST_SKIP(); +#endif + const int height = 16; + const int width = 16; + + // ------------------------------------------------------ + // Prepare input data + const InferenceEngine::TensorDesc y_plane_desc(InferenceEngine::Precision::U8, {1, 1, height, width}, + InferenceEngine::Layout::NHWC); + const InferenceEngine::TensorDesc uv_plane_desc(InferenceEngine::Precision::U8, {1, 2, height / 2, width / 2}, + InferenceEngine::Layout::NHWC); + std::vector fake_image_data_y; + std::vector fake_image_data_uv; + + for (int i = 0; i < num_batch; i++) { + fake_image_data_y.push_back(FuncTestUtils::createAndFillBlob(y_plane_desc, 50, 0, 1, i)); + 
fake_image_data_uv.push_back(FuncTestUtils::createAndFillBlob(uv_plane_desc, 256, 0, 1, i)); + } + + auto ie = InferenceEngine::Core(); + + // ------------------------------------------------------ + // inference using remote blob with batch + auto fn_ptr_remote = ngraph::builder::subgraph::makeConvPoolRelu({num_batch, 3, height, width}); + + CNNNetwork net_remote(fn_ptr_remote); + net_remote.getInputsInfo().begin()->second->setLayout(Layout::NCHW); + net_remote.getInputsInfo().begin()->second->setPrecision(Precision::U8); + net_remote.getInputsInfo().begin()->second->getPreProcess().setColorFormat(ColorFormat::NV12); + + /* XXX: is it correct to set KEY_CLDNN_NV12_TWO_INPUTS in case of remote blob? */ + auto exec_net_b = ie.LoadNetwork(net_remote, CommonTestUtils::DEVICE_GPU, + { { CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS, PluginConfigParams::YES} }); + auto inf_req_remote = exec_net_b.CreateInferRequest(); + auto cldnn_context = exec_net_b.GetContext(); + cl_context ctx = std::dynamic_pointer_cast(cldnn_context)->get(); + auto ocl_instance = std::make_shared(ctx); + cl_int err; + + std::vector nv12_image_plane_y, nv12_image_plane_uv; + std::vector img_y, img_uv; + std::vector blob_remote; + + for (int i = 0; i < num_batch; i++) { + cl_image_format image_format; + cl_image_desc image_desc = { 0 }; + image_format.image_channel_order = CL_R; + image_format.image_channel_data_type = CL_UNORM_INT8; + image_desc.image_type = CL_MEM_OBJECT_IMAGE2D; + image_desc.image_width = width; + image_desc.image_height = height; + + nv12_image_plane_y.push_back(clCreateImage(ocl_instance->_context.get(), CL_MEM_READ_WRITE, &image_format, &image_desc, NULL, &err)); + ASSERT_EQ(err, 0); + + image_format.image_channel_order = CL_RG; + image_desc.image_width = width / 2; + image_desc.image_height = height / 2; + + nv12_image_plane_uv.push_back(clCreateImage(ocl_instance->_context.get(), CL_MEM_READ_WRITE, &image_format, &image_desc, NULL, &err)); + ASSERT_EQ(err, 0); + + size_t 
origin[3] = { 0, 0, 0 }; + size_t y_region[3] = { (size_t)width, (size_t)height, 1 }; + size_t uv_region[3] = { (size_t)width / 2, (size_t)height / 2, 1 }; + + err = clEnqueueWriteImage(ocl_instance->_queue.get(), nv12_image_plane_y[i], + true, origin, y_region, 0, 0, fake_image_data_y[i]->buffer(), 0, NULL, NULL); + ASSERT_EQ(err, 0); + + err = clEnqueueWriteImage(ocl_instance->_queue.get(), nv12_image_plane_uv[i], + true, origin, uv_region, 0, 0, fake_image_data_uv[i]->buffer(), 0, NULL, NULL); + ASSERT_EQ(err, 0); + + img_y.push_back(cl::Image2D(nv12_image_plane_y[i])); + img_uv.push_back(cl::Image2D(nv12_image_plane_uv[i])); + + blob_remote.push_back(make_shared_blob_nv12(cldnn_context, img_y[i], img_uv[i])); + } + + if (num_batch == 1) { + inf_req_remote.SetBlob(net_remote.getInputsInfo().begin()->first, blob_remote[0]); + } else { + auto batched_blob = make_shared_blob(blob_remote); + inf_req_remote.SetBlob(net_remote.getInputsInfo().begin()->first, batched_blob); + } + + inf_req_remote.Infer(); + + auto outputBlob_shared = inf_req_remote.GetBlob(net_remote.getOutputsInfo().begin()->first); + + // ------------------------------------------------------ + // Setup to inference using local blob with batch=1 + auto fn_ptr_local = ngraph::builder::subgraph::makeConvPoolRelu({1, 3, height, width}); + + CNNNetwork net_local(fn_ptr_local); + + net_local.getInputsInfo().begin()->second->setLayout(Layout::NCHW); + net_local.getInputsInfo().begin()->second->setPrecision(Precision::U8); + net_local.getInputsInfo().begin()->second->getPreProcess().setColorFormat(ColorFormat::NV12); + + auto exec_net_b1 = ie.LoadNetwork(net_local, CommonTestUtils::DEVICE_GPU); + + auto inf_req_local = exec_net_b1.CreateInferRequest(); + + // Run regular input for each image and compare against batched blob + for (int i = 0; i < num_batch; i++) { + auto y_blob = make_shared_blob(y_plane_desc, fake_image_data_y[i]->buffer().as()); + auto uv_blob = make_shared_blob(uv_plane_desc, 
fake_image_data_uv[i]->buffer().as()); + auto blob = make_shared_blob(y_blob, uv_blob); + inf_req_local.SetBlob(net_local.getInputsInfo().begin()->first, blob); + inf_req_local.Infer(); + auto output_blob_local = inf_req_local.GetBlob(net_local.getOutputsInfo().begin()->first); + + // This network generates [1, size] tensor whether batch=1 or 2. So need to split + auto split_shared_blob = make_shared_blob(output_blob_local->getTensorDesc(), + outputBlob_shared->buffer().as() + output_blob_local->size() * i); + ASSERT_EQ(output_blob_local->size(), split_shared_blob->size()); + float thr = 0.1; + + FuncTestUtils::compareBlobs(output_blob_local, split_shared_blob, thr, "", false); + } +} + +const std::vector num_batches{1, 2, 4}; + +INSTANTIATE_TEST_CASE_P(smoke_RemoteBlob, BatchedBlob_Test, ::testing::ValuesIn(num_batches), BatchedBlob_Test::getTestCaseName); + class TwoNets_Test : public CommonTestUtils::TestsCommon, public testing::WithParamInterface { void SetUp() override { num_streams = this->GetParam(); @@ -211,6 +355,6 @@ TEST_P(TwoNets_Test, canInferTwoExecNets) { } } -const std::vector num_strems{1, 2}; +const std::vector num_streams{1, 2}; -INSTANTIATE_TEST_CASE_P(smoke_RemoteBlob, TwoNets_Test, ::testing::ValuesIn(num_strems), TwoNets_Test::getTestCaseName); +INSTANTIATE_TEST_CASE_P(smoke_RemoteBlob, TwoNets_Test, ::testing::ValuesIn(num_streams), TwoNets_Test::getTestCaseName); diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/set_blob_of_kind.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/set_blob_of_kind.cpp index 00988bdc510..936abfe826d 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/set_blob_of_kind.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/set_blob_of_kind.cpp @@ -10,8 +10,9 @@ using namespace InferenceEngine; const std::vector blobKinds = { FuncTestUtils::BlobKind::Simple, - 
FuncTestUtils::BlobKind::Compound, - FuncTestUtils::BlobKind::BatchOfSimple + FuncTestUtils::BlobKind::Compound + /* BatchOfSimple is not supported on GPU currently. Batch of remote is supported */ + /* , FuncTestUtils::BlobKind::BatchOfSimple */ }; const SetBlobOfKindConfig gpuConfig{}; //nothing special