From d4b071bd4993a9ce57e56259ba5cafae10f32c86 Mon Sep 17 00:00:00 2001 From: Mingyu Kim Date: Thu, 15 Apr 2021 17:18:34 +0900 Subject: [PATCH] [IE CLDNN] Batched blob support is added for NV12 (#5230) --- .../src/cldnn_engine/cldnn_engine.cpp | 1 + .../src/cldnn_engine/cldnn_infer_request.cpp | 146 ++++++++++------- .../src/cldnn_engine/ops/parameter.cpp | 78 +++++---- .../cldnn_remote_blob_tests.cpp | 148 +++++++++++++++++- .../behavior/set_blob_of_kind.cpp | 5 +- 5 files changed, 286 insertions(+), 92 deletions(-) diff --git a/inference-engine/src/cldnn_engine/cldnn_engine.cpp b/inference-engine/src/cldnn_engine/cldnn_engine.cpp index 01ea25f87ee..c4ea12d757a 100644 --- a/inference-engine/src/cldnn_engine/cldnn_engine.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_engine.cpp @@ -852,6 +852,7 @@ Parameter clDNNEngine::GetMetric(const std::string& name, const std::map network, } } +void checkInputBlobNV12(const NV12Blob *nv12_ptr) { + auto y_ptr = nv12_ptr->y()->as(); + + // if the blobs are not remote, check their size + if (!y_ptr) { + if (nv12_ptr->y()->buffer() == nullptr) IE_THROW(NotAllocated) << str_not_allocated; + } + + auto uv_ptr = nv12_ptr->uv()->as(); + if (!uv_ptr) { + if (nv12_ptr->uv()->buffer() == nullptr) IE_THROW(NotAllocated) << str_not_allocated; + } +} + +NV12Blob *getNV12BlobOrException(BatchedBlob *batched_ptr, int idx) { + auto nv12_ptr = batched_ptr->getBlob(idx)->as(); + if (nv12_ptr == nullptr) + IE_THROW(NotImplemented) << unsupported_batched_blob; + return nv12_ptr; +} + void checkInputBlob(const Blob::Ptr &blob, const std::string &name, const InputInfo::Ptr foundInput, @@ -334,23 +356,17 @@ void checkInputBlob(const Blob::Ptr &blob, if (ColorFormat::NV12 == foundInput->getPreProcess().getColorFormat() && nv12_two_inputs) { - auto nv12_ptr = blob->as(); - - if (nv12_ptr == nullptr) { + if (auto nv12_ptr = blob->as()) { + checkInputBlobNV12(nv12_ptr); + } else if (auto batched_ptr = blob->as()) { + for (auto i = 0; i < 
batched_ptr->size(); i++) { + auto nv12_ptr = getNV12BlobOrException(batched_ptr, i); + checkInputBlobNV12(nv12_ptr); + } + } else { IE_THROW(ParameterMismatch) << wrong_nv12_blob; } - auto y_ptr = nv12_ptr->y()->as(); - - // if the blobs are not remote, check their size - if (!y_ptr) { - if (nv12_ptr->y()->buffer() == nullptr) IE_THROW() << str_not_allocated; - } - - auto uv_ptr = nv12_ptr->uv()->as(); - if (!uv_ptr) { - if (nv12_ptr->uv()->buffer() == nullptr) IE_THROW() << str_not_allocated; - } } else { SizeVector dims = foundInput->getTensorDesc().getDims(); @@ -498,27 +514,35 @@ void CLDNNInferRequest::SetBlob(const std::string& name, const Blob::Ptr &data) // and put them into appropriate network inputs // that should then go into biplanar NV12 reorder auto nv12_ptr = data->as(); + auto batched_ptr = data->as(); - if (nv12_ptr == nullptr) { + if (nv12_ptr != nullptr || batched_ptr != nullptr) { + int num_blobs = batched_ptr != nullptr ? batched_ptr->size() : 1; + + for (auto i = 0; i < num_blobs; i++) { + if (batched_ptr != nullptr) + nv12_ptr = getNV12BlobOrException(batched_ptr, i); + + auto y_ptr = nv12_ptr->y()->as(); + if (y_ptr) { + auto y_impl = getBlobImpl(y_ptr); + y_impl->allocate_if_needed(); + input_attach(internalName + "_Y" + std::to_string(i), y_impl->getMemory()); + is_remote = true; + } + + auto uv_ptr = nv12_ptr->uv()->as(); + if (uv_ptr) { + auto uv_impl = getBlobImpl(uv_ptr); + uv_impl->allocate_if_needed(); + input_attach(internalName + "_UV" + std::to_string(i), uv_impl->getMemory()); + is_remote = true; + } + } + } else { IE_THROW(ParameterMismatch) << wrong_nv12_blob; } - auto y_ptr = nv12_ptr->y()->as(); - if (y_ptr) { - auto y_impl = getBlobImpl(y_ptr); - y_impl->allocate_if_needed(); - input_attach(internalName + "_Y", y_impl->getMemory()); - is_remote = true; - } - - auto uv_ptr = nv12_ptr->uv()->as(); - if (uv_ptr) { - auto uv_impl = getBlobImpl(uv_ptr); - uv_impl->allocate_if_needed(); - input_attach(internalName + "_UV", 
uv_impl->getMemory());
-                is_remote = true;
-            }
-
             if (is_remote) _inputs[name] = data;
         }
     }
@@ -582,28 +606,33 @@ void CLDNNInferRequest::AllocateInputs() {
         if (ColorFormat::NV12 == ni.second->getPreProcess().getColorFormat() &&
             m_graph->getConfig().nv12_two_inputs) {
-            cldnn::primitive_id YName(name + "_Y");
-            cldnn::primitive_id UVName(name + "_UV");
+            std::vector blobs;
+            for (auto i = 0; i < desc.getDims()[0]; i++) {
+                cldnn::primitive_id YName(name + "_Y" + std::to_string(i));
+                cldnn::primitive_id UVName(name + "_UV" + std::to_string(i));
 
-            if (inputLayouts.find(YName) == inputLayouts.end()) {
-                IE_THROW() << "Input layout for " << YName << " is not found";
+                if (inputLayouts.find(YName) == inputLayouts.end()) {
+                    IE_THROW(ParameterMismatch) << "Input layout for " << YName << " is not found";
+                }
+                if (inputLayouts.find(UVName) == inputLayouts.end()) {
+                    IE_THROW(ParameterMismatch) << "Input layout for " << UVName << " is not found";
+                }
+                input_alloc(YName, inputLayouts.at(YName));
+                input_alloc(UVName, inputLayouts.at(UVName));
+
+                size_t height = desc.getDims()[2], width = desc.getDims()[3];
+                cldnn::pointer input_mem_ptr_Y = inputsMemory.at(YName).pointer();
+                TensorDesc ydesc(Precision::U8, { 1, 1, height, width }, Layout::NHWC);
+                auto blobY = createInputBlob(ydesc, input_mem_ptr_Y.data());
+
+                cldnn::pointer input_mem_ptr_UV = inputsMemory.at(UVName).pointer();
+                TensorDesc uvdesc(Precision::U8, { 1, 2, height / 2, width / 2 }, Layout::NHWC);
+                auto blobUV = createInputBlob(uvdesc, input_mem_ptr_UV.data());
+
+                blobs.push_back(make_shared_blob(blobY, blobUV));
             }
-            if (inputLayouts.find(UVName) == inputLayouts.end()) {
-                IE_THROW() << "Input layout for " << UVName << " is not found";
-            }
-            input_alloc(YName, inputLayouts.at(YName));
-            input_alloc(UVName, inputLayouts.at(UVName));
+            _inputs[name] = desc.getDims()[0] == 1 ? 
blobs[0] : make_shared_blob(blobs); - size_t height = desc.getDims()[2], width = desc.getDims()[3]; - cldnn::pointer input_mem_ptr_Y = inputsMemory.at(YName).pointer(); - TensorDesc ydesc(Precision::U8, { 1, 1, height, width }, Layout::NHWC); - auto blobY = createInputBlob(ydesc, input_mem_ptr_Y.data()); - - cldnn::pointer input_mem_ptr_UV = inputsMemory.at(UVName).pointer(); - TensorDesc uvdesc(Precision::U8, { 1, 2, height / 2, width / 2 }, Layout::NHWC); - auto blobUV = createInputBlob(uvdesc, input_mem_ptr_UV.data()); - - _inputs[name] = make_shared_blob(blobY, blobUV); } else { if (inputLayouts.find(name) == inputLayouts.end()) { IE_THROW() << "Input layout for " << name << " is not found"; @@ -868,14 +897,21 @@ void CLDNNInferRequest::InferImpl() { PrepareInputDyn(name, *inputBlob); } else { auto nv12_ptr = inputBlob->as(); + auto batched_ptr = inputBlob->as(); - if (nv12_ptr == nullptr) { + if (nv12_ptr != nullptr || batched_ptr != nullptr) { + // special case for NV12 input blob + int num_blobs = batched_ptr != nullptr ? 
batched_ptr->size() : 1; + for (auto i = 0; i < num_blobs; i++) { + if (batched_ptr != nullptr) + nv12_ptr = getNV12BlobOrException(batched_ptr, i); + + PrepareInput(name + "_Y" + std::to_string(i), *nv12_ptr->y()); + PrepareInput(name + "_UV" + std::to_string(i), *nv12_ptr->uv()); + } + } else { // regular blob PrepareInput(name, *inputBlob); - } else { - // special case for NV12 input blob - PrepareInput(name + "_Y", *nv12_ptr->y()); - PrepareInput(name + "_UV", *nv12_ptr->uv()); } } } diff --git a/inference-engine/src/cldnn_engine/ops/parameter.cpp b/inference-engine/src/cldnn_engine/ops/parameter.cpp index 19286ef9278..df4290291f7 100644 --- a/inference-engine/src/cldnn_engine/ops/parameter.cpp +++ b/inference-engine/src/cldnn_engine/ops/parameter.cpp @@ -10,6 +10,7 @@ #include "api/input_layout.hpp" #include "api/reorder.hpp" #include "api/data.hpp" +#include "api/concatenation.hpp" using namespace InferenceEngine; @@ -97,7 +98,6 @@ void CreateParameterOp(Program& p, const std::shared_ptrget_output_element_type(0)); - auto preprocessPrimID = "reorder:" + inputName + Program::m_preProcessTag; cldnn::primitive_id meanBlobID = inputName + Program::m_meanValuesTag; std::vector meanValues; @@ -184,41 +184,55 @@ void CreateParameterOp(Program& p, const std::shared_ptr reorders; - std::string y_name = inputName + "_Y"; - std::string uv_name = inputName + "_UV"; + for (auto i = 0; i < inputDims[0]; i++) { + auto preprocessPrimID = "reorder:" + inputName + std::to_string(i) + Program::m_preProcessTag; + std::string y_name = inputName + "_Y" + std::to_string(i); + std::string uv_name = inputName + "_UV" + std::to_string(i); - cldnn::layout y_layout(DataTypeFromPrecision(ip), - cldnn::format::nv12, { 1, 1, width, height }); - cldnn::layout uv_layout(DataTypeFromPrecision(ip), - cldnn::format::nv12, { 1, 2, width / 2, height / 2 }); - auto inputY = cldnn::input_layout(y_name, y_layout); - auto inputUV = cldnn::input_layout(uv_name, uv_layout); + cldnn::layout 
y_layout(DataTypeFromPrecision(ip), + cldnn::format::nv12, { 1, 1, width, height }); + cldnn::layout uv_layout(DataTypeFromPrecision(ip), + cldnn::format::nv12, { 1, 2, width / 2, height / 2 }); + auto inputY = cldnn::input_layout(y_name, y_layout); + auto inputUV = cldnn::input_layout(uv_name, uv_layout); - p.AddPrimitive(inputY); - p.inputLayouts.insert({ inputInfo->name() + "_Y", y_layout }); - p.AddPrimitive(inputUV); - p.inputLayouts.insert({ inputInfo->name() + "_UV", uv_layout }); - switch (preProcess.getMeanVariant()) { - case NONE: - case MEAN_VALUE: { - p.AddPrimitive(cldnn::reorder(preprocessPrimID, y_name, uv_name, networkInputLayout, meanValues)); - break; - } - case MEAN_IMAGE: { - p.AddPrimitive(cldnn::reorder(preprocessPrimID, y_name, uv_name, networkInputLayout, meanBlobID)); - break; - } - default: IE_THROW() << "Invalid mean variant in input " + inputName; - break; + p.AddPrimitive(inputY); + p.inputLayouts.insert({ inputInfo->name() + "_Y" + std::to_string(i), y_layout }); + p.AddPrimitive(inputUV); + p.inputLayouts.insert({ inputInfo->name() + "_UV" + std::to_string(i), uv_layout }); + switch (preProcess.getMeanVariant()) { + case NONE: + case MEAN_VALUE: { + p.AddPrimitive(cldnn::reorder(preprocessPrimID, y_name, uv_name, networkInputLayout, meanValues)); + break; + } + case MEAN_IMAGE: { + p.AddPrimitive(cldnn::reorder(preprocessPrimID, y_name, uv_name, networkInputLayout, meanBlobID)); + break; + } + default: IE_THROW(Unexpected) << "Invalid mean variant in input " + inputName; + break; + } + + p.primitivesToIRLayersMap[preprocessPrimID] = { inputInfo->name() }; + p.primitivesToIRLayersMap[y_name] = { inputInfo->name() }; + p.primitivesToIRLayersMap[uv_name] = { inputInfo->name() }; + p.profilingIDs.push_back(preprocessPrimID); + p.InitProfileInfo(preprocessPrimID, "Reorder"); + p.primitiveIDs[inputName] = preprocessPrimID; // If it is batched blob, it will be overwritten afterwards. 
+ p.primitiveIDs[preprocessPrimID] = preprocessPrimID; + reorders.push_back(preprocessPrimID); } - p.primitivesToIRLayersMap[preprocessPrimID] = { inputInfo->name() }; - p.primitivesToIRLayersMap[y_name] = { inputInfo->name() }; - p.primitivesToIRLayersMap[uv_name] = { inputInfo->name() }; - p.profilingIDs.push_back(preprocessPrimID); - p.InitProfileInfo(preprocessPrimID, "Reorder"); + if (inputDims[0] > 1) { + auto concatPrimID = "concat:" + inputName + Program::m_preProcessTag; + p.AddPrimitive(cldnn::concatenation(concatPrimID, reorders, cldnn::concatenation::along_b)); + p.primitiveIDs[inputName] = concatPrimID; + } } else { + auto preprocessPrimID = "reorder:" + inputName + Program::m_preProcessTag; cldnn::layout inputLayout(networkInputLayout); inputLayout.data_type = DataTypeFromPrecision(ip); p.inputLayouts.insert({ inputInfo->name(), inputLayout }); @@ -244,11 +258,9 @@ void CreateParameterOp(Program& p, const std::shared_ptr { + void SetUp() override { + num_batch = this->GetParam(); + }; +public: + static std::string getTestCaseName(const testing::TestParamInfo &obj) { + return "num_batch_" + std::to_string(obj.param); + } + +protected: + size_t num_batch; + std::vector> fn_ptrs; +}; + +TEST_P(BatchedBlob_Test, canInputNV12) { +#if defined(_WIN32) || defined(ANDROID) + GTEST_SKIP(); +#endif + const int height = 16; + const int width = 16; + + // ------------------------------------------------------ + // Prepare input data + const InferenceEngine::TensorDesc y_plane_desc(InferenceEngine::Precision::U8, {1, 1, height, width}, + InferenceEngine::Layout::NHWC); + const InferenceEngine::TensorDesc uv_plane_desc(InferenceEngine::Precision::U8, {1, 2, height / 2, width / 2}, + InferenceEngine::Layout::NHWC); + std::vector fake_image_data_y; + std::vector fake_image_data_uv; + + for (int i = 0; i < num_batch; i++) { + fake_image_data_y.push_back(FuncTestUtils::createAndFillBlob(y_plane_desc, 50, 0, 1, i)); + 
fake_image_data_uv.push_back(FuncTestUtils::createAndFillBlob(uv_plane_desc, 256, 0, 1, i)); + } + + auto ie = InferenceEngine::Core(); + + // ------------------------------------------------------ + // inference using remote blob with batch + auto fn_ptr_remote = ngraph::builder::subgraph::makeConvPoolRelu({num_batch, 3, height, width}); + + CNNNetwork net_remote(fn_ptr_remote); + net_remote.getInputsInfo().begin()->second->setLayout(Layout::NCHW); + net_remote.getInputsInfo().begin()->second->setPrecision(Precision::U8); + net_remote.getInputsInfo().begin()->second->getPreProcess().setColorFormat(ColorFormat::NV12); + + /* XXX: is it correct to set KEY_CLDNN_NV12_TWO_INPUTS in case of remote blob? */ + auto exec_net_b = ie.LoadNetwork(net_remote, CommonTestUtils::DEVICE_GPU, + { { CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS, PluginConfigParams::YES} }); + auto inf_req_remote = exec_net_b.CreateInferRequest(); + auto cldnn_context = exec_net_b.GetContext(); + cl_context ctx = std::dynamic_pointer_cast(cldnn_context)->get(); + auto ocl_instance = std::make_shared(ctx); + cl_int err; + + std::vector nv12_image_plane_y, nv12_image_plane_uv; + std::vector img_y, img_uv; + std::vector blob_remote; + + for (int i = 0; i < num_batch; i++) { + cl_image_format image_format; + cl_image_desc image_desc = { 0 }; + image_format.image_channel_order = CL_R; + image_format.image_channel_data_type = CL_UNORM_INT8; + image_desc.image_type = CL_MEM_OBJECT_IMAGE2D; + image_desc.image_width = width; + image_desc.image_height = height; + + nv12_image_plane_y.push_back(clCreateImage(ocl_instance->_context.get(), CL_MEM_READ_WRITE, &image_format, &image_desc, NULL, &err)); + ASSERT_EQ(err, 0); + + image_format.image_channel_order = CL_RG; + image_desc.image_width = width / 2; + image_desc.image_height = height / 2; + + nv12_image_plane_uv.push_back(clCreateImage(ocl_instance->_context.get(), CL_MEM_READ_WRITE, &image_format, &image_desc, NULL, &err)); + ASSERT_EQ(err, 0); + + size_t 
origin[3] = { 0, 0, 0 }; + size_t y_region[3] = { (size_t)width, (size_t)height, 1 }; + size_t uv_region[3] = { (size_t)width / 2, (size_t)height / 2, 1 }; + + err = clEnqueueWriteImage(ocl_instance->_queue.get(), nv12_image_plane_y[i], + true, origin, y_region, 0, 0, fake_image_data_y[i]->buffer(), 0, NULL, NULL); + ASSERT_EQ(err, 0); + + err = clEnqueueWriteImage(ocl_instance->_queue.get(), nv12_image_plane_uv[i], + true, origin, uv_region, 0, 0, fake_image_data_uv[i]->buffer(), 0, NULL, NULL); + ASSERT_EQ(err, 0); + + img_y.push_back(cl::Image2D(nv12_image_plane_y[i])); + img_uv.push_back(cl::Image2D(nv12_image_plane_uv[i])); + + blob_remote.push_back(make_shared_blob_nv12(cldnn_context, img_y[i], img_uv[i])); + } + + if (num_batch == 1) { + inf_req_remote.SetBlob(net_remote.getInputsInfo().begin()->first, blob_remote[0]); + } else { + auto batched_blob = make_shared_blob(blob_remote); + inf_req_remote.SetBlob(net_remote.getInputsInfo().begin()->first, batched_blob); + } + + inf_req_remote.Infer(); + + auto outputBlob_shared = inf_req_remote.GetBlob(net_remote.getOutputsInfo().begin()->first); + + // ------------------------------------------------------ + // Setup to inference using local blob with batch=1 + auto fn_ptr_local = ngraph::builder::subgraph::makeConvPoolRelu({1, 3, height, width}); + + CNNNetwork net_local(fn_ptr_local); + + net_local.getInputsInfo().begin()->second->setLayout(Layout::NCHW); + net_local.getInputsInfo().begin()->second->setPrecision(Precision::U8); + net_local.getInputsInfo().begin()->second->getPreProcess().setColorFormat(ColorFormat::NV12); + + auto exec_net_b1 = ie.LoadNetwork(net_local, CommonTestUtils::DEVICE_GPU); + + auto inf_req_local = exec_net_b1.CreateInferRequest(); + + // Run regular input for each image and compare against batched blob + for (int i = 0; i < num_batch; i++) { + auto y_blob = make_shared_blob(y_plane_desc, fake_image_data_y[i]->buffer().as()); + auto uv_blob = make_shared_blob(uv_plane_desc, 
fake_image_data_uv[i]->buffer().as()); + auto blob = make_shared_blob(y_blob, uv_blob); + inf_req_local.SetBlob(net_local.getInputsInfo().begin()->first, blob); + inf_req_local.Infer(); + auto output_blob_local = inf_req_local.GetBlob(net_local.getOutputsInfo().begin()->first); + + // This network generates [1, size] tensor whether batch=1 or 2. So need to split + auto split_shared_blob = make_shared_blob(output_blob_local->getTensorDesc(), + outputBlob_shared->buffer().as() + output_blob_local->size() * i); + ASSERT_EQ(output_blob_local->size(), split_shared_blob->size()); + float thr = 0.1; + + FuncTestUtils::compareBlobs(output_blob_local, split_shared_blob, thr, "", false); + } +} + +const std::vector num_batches{1, 2, 4}; + +INSTANTIATE_TEST_CASE_P(smoke_RemoteBlob, BatchedBlob_Test, ::testing::ValuesIn(num_batches), BatchedBlob_Test::getTestCaseName); + class TwoNets_Test : public CommonTestUtils::TestsCommon, public testing::WithParamInterface { void SetUp() override { num_streams = this->GetParam(); @@ -211,6 +355,6 @@ TEST_P(TwoNets_Test, canInferTwoExecNets) { } } -const std::vector num_strems{1, 2}; +const std::vector num_streams{1, 2}; -INSTANTIATE_TEST_CASE_P(smoke_RemoteBlob, TwoNets_Test, ::testing::ValuesIn(num_strems), TwoNets_Test::getTestCaseName); +INSTANTIATE_TEST_CASE_P(smoke_RemoteBlob, TwoNets_Test, ::testing::ValuesIn(num_streams), TwoNets_Test::getTestCaseName); diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/set_blob_of_kind.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/set_blob_of_kind.cpp index 00988bdc510..936abfe826d 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/set_blob_of_kind.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/set_blob_of_kind.cpp @@ -10,8 +10,9 @@ using namespace InferenceEngine; const std::vector blobKinds = { FuncTestUtils::BlobKind::Simple, - 
FuncTestUtils::BlobKind::Compound, - FuncTestUtils::BlobKind::BatchOfSimple + FuncTestUtils::BlobKind::Compound + /* BatchOfSimple is not supported on GPU currently. Batch of remote is supported */ + /* , FuncTestUtils::BlobKind::BatchOfSimple */ }; const SetBlobOfKindConfig gpuConfig{}; //nothing special