[IE CLDNN] Batched blob support is added for NV12 (#5230)

Mingyu Kim 2021-04-15 17:18:34 +09:00 committed by GitHub
parent 887c8c46cc
commit d4b071bd49
5 changed files with 286 additions and 92 deletions
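With this change the GPU plugin accepts a BatchedBlob whose elements are NV12 remote blobs, so several NV12 surfaces can be submitted for one network input in a single SetBlob call. A minimal usage sketch, assuming exec_net, infer_request, input_name, num_batch and the per-image cl::Image2D planes img_y[i]/img_uv[i] already exist (as in the new test case at the end of this commit):

// Sketch only: a batch of NV12 remote blobs passed as one input.
#include <gpu/gpu_context_api_ocl.hpp>
using namespace InferenceEngine;

auto ctx = exec_net.GetContext();                  // remote (clDNN) context of the executable network
std::vector<Blob::Ptr> nv12_blobs;
for (size_t i = 0; i < num_batch; i++)
    nv12_blobs.push_back(gpu::make_shared_blob_nv12(ctx, img_y[i], img_uv[i]));

Blob::Ptr input = (num_batch == 1)
    ? nv12_blobs[0]                                // single image: plain NV12 blob, as before
    : make_shared_blob<BatchedBlob>(nv12_blobs);   // several images: the new batched path
infer_request.SetBlob(input_name, input);
infer_request.Infer();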

View File

@@ -852,6 +852,7 @@ Parameter clDNNEngine::GetMetric(const std::string& name, const std::map<std::st
capabilities.push_back(METRIC_VALUE(FP32));
capabilities.push_back(METRIC_VALUE(BIN));
capabilities.push_back(METRIC_VALUE(BATCHED_BLOB));
if (device_info.supports_fp16)
capabilities.push_back(METRIC_VALUE(FP16));
if (device_info.supports_imad || device_info.supports_immad)
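Publishing BATCHED_BLOB in OPTIMIZATION_CAPABILITIES lets an application probe for the feature at run time. A minimal sketch of such a check; the "GPU" device name and variable names are illustrative:

// Sketch only: query the plugin's optimization capabilities and look for BATCHED_BLOB.
#include <algorithm>
#include <inference_engine.hpp>
using namespace InferenceEngine;

Core ie;
auto caps = ie.GetMetric("GPU", METRIC_KEY(OPTIMIZATION_CAPABILITIES)).as<std::vector<std::string>>();
const bool batched_blob_supported =
    std::find(caps.begin(), caps.end(), METRIC_VALUE(BATCHED_BLOB)) != caps.end();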

View File

@@ -21,6 +21,7 @@ const char CLDNNInferRequest::fp32_suffix[] = "_fp32";
const char str_not_allocated[] = "Input data was not allocated.";
const char cannot_set_compound[] = "cannot set compound blob: supported only for input pre-processing";
const char wrong_nv12_blob[] = "NV12 input blob is expected for input with NV12 color format";
const char unsupported_batched_blob[] = "Batched input blob is expected to contain NV12 blobs";
Blob::Ptr CLDNNInferRequest::createInputBlob(const TensorDesc& desc, uint8_t* mem_ptr) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::createInputBlob");
@@ -322,6 +323,27 @@ void CLDNNInferRequest::copyInputData(std::shared_ptr<cldnn::network> network,
}
}
void checkInputBlobNV12(const NV12Blob *nv12_ptr) {
auto y_ptr = nv12_ptr->y()->as<gpu::ClBlob>();
// if the blobs are not remote, check their size
if (!y_ptr) {
if (nv12_ptr->y()->buffer() == nullptr) IE_THROW(NotAllocated) << str_not_allocated;
}
auto uv_ptr = nv12_ptr->uv()->as<gpu::ClBlob>();
if (!uv_ptr) {
if (nv12_ptr->uv()->buffer() == nullptr) IE_THROW(NotAllocated) << str_not_allocated;
}
}
NV12Blob *getNV12BlobOrException(BatchedBlob *batched_ptr, int idx) {
auto nv12_ptr = batched_ptr->getBlob(idx)->as<NV12Blob>();
if (nv12_ptr == nullptr)
IE_THROW(NotImplemented) << unsupported_batched_blob;
return nv12_ptr;
}
void checkInputBlob(const Blob::Ptr &blob,
const std::string &name,
const InputInfo::Ptr foundInput,
@@ -334,23 +356,17 @@ void checkInputBlob(const Blob::Ptr &blob,
if (ColorFormat::NV12 == foundInput->getPreProcess().getColorFormat() &&
nv12_two_inputs) {
auto nv12_ptr = blob->as<NV12Blob>();
if (nv12_ptr == nullptr) {
if (auto nv12_ptr = blob->as<NV12Blob>()) {
checkInputBlobNV12(nv12_ptr);
} else if (auto batched_ptr = blob->as<BatchedBlob>()) {
for (auto i = 0; i < batched_ptr->size(); i++) {
auto nv12_ptr = getNV12BlobOrException(batched_ptr, i);
checkInputBlobNV12(nv12_ptr);
}
} else {
IE_THROW(ParameterMismatch) << wrong_nv12_blob;
}
auto y_ptr = nv12_ptr->y()->as<gpu::ClBlob>();
// if the blobs are not remote, check their size
if (!y_ptr) {
if (nv12_ptr->y()->buffer() == nullptr) IE_THROW() << str_not_allocated;
}
auto uv_ptr = nv12_ptr->uv()->as<gpu::ClBlob>();
if (!uv_ptr) {
if (nv12_ptr->uv()->buffer() == nullptr) IE_THROW() << str_not_allocated;
}
} else {
SizeVector dims = foundInput->getTensorDesc().getDims();
@@ -498,27 +514,35 @@ void CLDNNInferRequest::SetBlob(const std::string& name, const Blob::Ptr &data)
// and put them into appropriate network inputs
// that should then go into biplanar NV12 reorder
auto nv12_ptr = data->as<NV12Blob>();
auto batched_ptr = data->as<BatchedBlob>();
if (nv12_ptr == nullptr) {
if (nv12_ptr != nullptr || batched_ptr != nullptr) {
int num_blobs = batched_ptr != nullptr ? batched_ptr->size() : 1;
for (auto i = 0; i < num_blobs; i++) {
if (batched_ptr != nullptr)
nv12_ptr = getNV12BlobOrException(batched_ptr, i);
auto y_ptr = nv12_ptr->y()->as<gpu::ClBlob>();
if (y_ptr) {
auto y_impl = getBlobImpl(y_ptr);
y_impl->allocate_if_needed();
input_attach(internalName + "_Y" + std::to_string(i), y_impl->getMemory());
is_remote = true;
}
auto uv_ptr = nv12_ptr->uv()->as<gpu::ClBlob>();
if (uv_ptr) {
auto uv_impl = getBlobImpl(uv_ptr);
uv_impl->allocate_if_needed();
input_attach(internalName + "_UV" + std::to_string(i), uv_impl->getMemory());
is_remote = true;
}
}
} else {
IE_THROW(ParameterMismatch) << wrong_nv12_blob;
}
auto y_ptr = nv12_ptr->y()->as<gpu::ClBlob>();
if (y_ptr) {
auto y_impl = getBlobImpl(y_ptr);
y_impl->allocate_if_needed();
input_attach(internalName + "_Y", y_impl->getMemory());
is_remote = true;
}
auto uv_ptr = nv12_ptr->uv()->as<gpu::ClBlob>();
if (uv_ptr) {
auto uv_impl = getBlobImpl(uv_ptr);
uv_impl->allocate_if_needed();
input_attach(internalName + "_UV", uv_impl->getMemory());
is_remote = true;
}
if (is_remote) _inputs[name] = data;
}
}
@@ -582,28 +606,33 @@ void CLDNNInferRequest::AllocateInputs() {
if (ColorFormat::NV12 == ni.second->getPreProcess().getColorFormat() &&
m_graph->getConfig().nv12_two_inputs) {
cldnn::primitive_id YName(name + "_Y");
cldnn::primitive_id UVName(name + "_UV");
std::vector<Blob::Ptr> blobs;
for (auto i = 0; i < desc.getDims()[0]; i++) {
cldnn::primitive_id YName(name + "_Y" + std::to_string(i));
cldnn::primitive_id UVName(name + "_UV" + std::to_string(i));
if (inputLayouts.find(YName) == inputLayouts.end()) {
IE_THROW() << "Input layout for " << YName << " is not found";
if (inputLayouts.find(YName) == inputLayouts.end()) {
IE_THROW(ParameterMismatch) << "Input layout for " << YName << " is not found";
}
if (inputLayouts.find(UVName) == inputLayouts.end()) {
IE_THROW(ParameterMismatch) << "Input layout for " << YName << " is not found";
}
input_alloc(YName, inputLayouts.at(YName));
input_alloc(UVName, inputLayouts.at(UVName));
size_t height = desc.getDims()[2], width = desc.getDims()[3];
cldnn::pointer<uint8_t> input_mem_ptr_Y = inputsMemory.at(YName).pointer<uint8_t>();
TensorDesc ydesc(Precision::U8, { 1, 1, height, width }, Layout::NHWC);
auto blobY = createInputBlob(ydesc, input_mem_ptr_Y.data());
cldnn::pointer<uint8_t> input_mem_ptr_UV = inputsMemory.at(UVName).pointer<uint8_t>();
TensorDesc uvdesc(Precision::U8, { 1, 2, height / 2, width / 2 }, Layout::NHWC);
auto blobUV = createInputBlob(uvdesc, input_mem_ptr_UV.data());
blobs.push_back(make_shared_blob<NV12Blob>(blobY, blobUV));
}
if (inputLayouts.find(UVName) == inputLayouts.end()) {
IE_THROW() << "Input layout for " << UVName << " is not found";
}
input_alloc(YName, inputLayouts.at(YName));
input_alloc(UVName, inputLayouts.at(UVName));
_inputs[name] = desc.getDims()[0] == 1 ? blobs[0] : make_shared_blob<BatchedBlob>(blobs);
size_t height = desc.getDims()[2], width = desc.getDims()[3];
cldnn::pointer<uint8_t> input_mem_ptr_Y = inputsMemory.at(YName).pointer<uint8_t>();
TensorDesc ydesc(Precision::U8, { 1, 1, height, width }, Layout::NHWC);
auto blobY = createInputBlob(ydesc, input_mem_ptr_Y.data());
cldnn::pointer<uint8_t> input_mem_ptr_UV = inputsMemory.at(UVName).pointer<uint8_t>();
TensorDesc uvdesc(Precision::U8, { 1, 2, height / 2, width / 2 }, Layout::NHWC);
auto blobUV = createInputBlob(uvdesc, input_mem_ptr_UV.data());
_inputs[name] = make_shared_blob<NV12Blob>(blobY, blobUV);
} else {
if (inputLayouts.find(name) == inputLayouts.end()) {
IE_THROW() << "Input layout for " << name << " is not found";
@@ -868,14 +897,21 @@ void CLDNNInferRequest::InferImpl() {
PrepareInputDyn(name, *inputBlob);
} else {
auto nv12_ptr = inputBlob->as<NV12Blob>();
auto batched_ptr = inputBlob->as<BatchedBlob>();
if (nv12_ptr == nullptr) {
if (nv12_ptr != nullptr || batched_ptr != nullptr) {
// special case for NV12 input blob
int num_blobs = batched_ptr != nullptr ? batched_ptr->size() : 1;
for (auto i = 0; i < num_blobs; i++) {
if (batched_ptr != nullptr)
nv12_ptr = getNV12BlobOrException(batched_ptr, i);
PrepareInput(name + "_Y" + std::to_string(i), *nv12_ptr->y());
PrepareInput(name + "_UV" + std::to_string(i), *nv12_ptr->uv());
}
} else {
// regular blob
PrepareInput(name, *inputBlob);
} else {
// special case for NV12 input blob
PrepareInput(name + "_Y", *nv12_ptr->y());
PrepareInput(name + "_UV", *nv12_ptr->uv());
}
}
}
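On the host-memory path, AllocateInputs above now pre-allocates one NV12Blob per batch element and, for a batch larger than one, exposes them to the application as a single BatchedBlob. A sketch of filling those preallocated planes through an existing infer_request before Infer(); the input name "data" and the fill_* helpers are assumptions, not part of this commit:

// Sketch only: write NV12 data into the planes preallocated by the plugin.
using namespace InferenceEngine;
auto blob = infer_request.GetBlob("data");
if (auto batched = blob->as<BatchedBlob>()) {
    for (size_t i = 0; i < batched->size(); i++) {
        auto nv12 = batched->getBlob(i)->as<NV12Blob>();
        // y(): U8 {1, 1, H, W} NHWC, uv(): U8 {1, 2, H/2, W/2} NHWC, matching the descs above.
        fill_y_plane(nv12->y()->buffer().as<uint8_t*>(), i);     // assumed user helpers
        fill_uv_plane(nv12->uv()->buffer().as<uint8_t*>(), i);
    }
} else if (auto nv12 = blob->as<NV12Blob>()) {                   // batch == 1 keeps the old shape
    fill_y_plane(nv12->y()->buffer().as<uint8_t*>(), 0);
    fill_uv_plane(nv12->uv()->buffer().as<uint8_t*>(), 0);
}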

View File

@@ -10,6 +10,7 @@
#include "api/input_layout.hpp"
#include "api/reorder.hpp"
#include "api/data.hpp"
#include "api/concatenation.hpp"
using namespace InferenceEngine;
@@ -97,7 +98,6 @@ void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::Paramet
networkInputLayout.format = inputFormat;
networkInputLayout.size = networkInputLayout.size.transform(inputFormat, 1);
networkInputLayout.data_type = DataTypeFromPrecision(op->get_output_element_type(0));
auto preprocessPrimID = "reorder:" + inputName + Program::m_preProcessTag;
cldnn::primitive_id meanBlobID = inputName + Program::m_meanValuesTag;
std::vector<float> meanValues;
@@ -184,41 +184,55 @@ void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::Paramet
}
int height = inputDims[2];
int width = inputDims[3];
std::vector<cldnn::primitive_id> reorders;
std::string y_name = inputName + "_Y";
std::string uv_name = inputName + "_UV";
for (auto i = 0; i < inputDims[0]; i++) {
auto preprocessPrimID = "reorder:" + inputName + std::to_string(i) + Program::m_preProcessTag;
std::string y_name = inputName + "_Y" + std::to_string(i);
std::string uv_name = inputName + "_UV" + std::to_string(i);
cldnn::layout y_layout(DataTypeFromPrecision(ip),
cldnn::format::nv12, { 1, 1, width, height });
cldnn::layout uv_layout(DataTypeFromPrecision(ip),
cldnn::format::nv12, { 1, 2, width / 2, height / 2 });
auto inputY = cldnn::input_layout(y_name, y_layout);
auto inputUV = cldnn::input_layout(uv_name, uv_layout);
cldnn::layout y_layout(DataTypeFromPrecision(ip),
cldnn::format::nv12, { 1, 1, width, height });
cldnn::layout uv_layout(DataTypeFromPrecision(ip),
cldnn::format::nv12, { 1, 2, width / 2, height / 2 });
auto inputY = cldnn::input_layout(y_name, y_layout);
auto inputUV = cldnn::input_layout(uv_name, uv_layout);
p.AddPrimitive(inputY);
p.inputLayouts.insert({ inputInfo->name() + "_Y", y_layout });
p.AddPrimitive(inputUV);
p.inputLayouts.insert({ inputInfo->name() + "_UV", uv_layout });
switch (preProcess.getMeanVariant()) {
case NONE:
case MEAN_VALUE: {
p.AddPrimitive(cldnn::reorder(preprocessPrimID, y_name, uv_name, networkInputLayout, meanValues));
break;
}
case MEAN_IMAGE: {
p.AddPrimitive(cldnn::reorder(preprocessPrimID, y_name, uv_name, networkInputLayout, meanBlobID));
break;
}
default: IE_THROW() << "Invalid mean variant in input " + inputName;
break;
p.AddPrimitive(inputY);
p.inputLayouts.insert({ inputInfo->name() + "_Y" + std::to_string(i), y_layout });
p.AddPrimitive(inputUV);
p.inputLayouts.insert({ inputInfo->name() + "_UV" + std::to_string(i), uv_layout });
switch (preProcess.getMeanVariant()) {
case NONE:
case MEAN_VALUE: {
p.AddPrimitive(cldnn::reorder(preprocessPrimID, y_name, uv_name, networkInputLayout, meanValues));
break;
}
case MEAN_IMAGE: {
p.AddPrimitive(cldnn::reorder(preprocessPrimID, y_name, uv_name, networkInputLayout, meanBlobID));
break;
}
default: IE_THROW(Unexpected) << "Invalid mean variant in input " + inputName;
break;
}
p.primitivesToIRLayersMap[preprocessPrimID] = { inputInfo->name() };
p.primitivesToIRLayersMap[y_name] = { inputInfo->name() };
p.primitivesToIRLayersMap[uv_name] = { inputInfo->name() };
p.profilingIDs.push_back(preprocessPrimID);
p.InitProfileInfo(preprocessPrimID, "Reorder");
p.primitiveIDs[inputName] = preprocessPrimID; // If it is batched blob, it will be overwritten afterwards.
p.primitiveIDs[preprocessPrimID] = preprocessPrimID;
reorders.push_back(preprocessPrimID);
}
p.primitivesToIRLayersMap[preprocessPrimID] = { inputInfo->name() };
p.primitivesToIRLayersMap[y_name] = { inputInfo->name() };
p.primitivesToIRLayersMap[uv_name] = { inputInfo->name() };
p.profilingIDs.push_back(preprocessPrimID);
p.InitProfileInfo(preprocessPrimID, "Reorder");
if (inputDims[0] > 1) {
auto concatPrimID = "concat:" + inputName + Program::m_preProcessTag;
p.AddPrimitive(cldnn::concatenation(concatPrimID, reorders, cldnn::concatenation::along_b));
p.primitiveIDs[inputName] = concatPrimID;
}
} else {
auto preprocessPrimID = "reorder:" + inputName + Program::m_preProcessTag;
cldnn::layout inputLayout(networkInputLayout);
inputLayout.data_type = DataTypeFromPrecision(ip);
p.inputLayouts.insert({ inputInfo->name(), inputLayout });
@@ -244,11 +258,9 @@ void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::Paramet
}
p.InitProfileInfo(preprocessPrimID, "reorder");
p.primitiveIDs[preprocessPrimID] = preprocessPrimID;
p.primitiveIDs[inputName] = preprocessPrimID;
p.profilingIDs.push_back(preprocessPrimID);
}
p.primitiveIDs[inputName] = preprocessPrimID;
p.primitiveIDs[preprocessPrimID] = preprocessPrimID;
}
REGISTER_FACTORY_IMPL(v0, Parameter);
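For a batched NV12 parameter the code above therefore creates a pair of nv12 input_layouts and one biplanar reorder per image, then joins the per-image reorders with a concatenation along the batch axis. A hedged sketch of the equivalent sub-topology for batch 2; the primitive ids are illustrative and width, height, networkInputLayout and meanValues stand in for the values computed above:

// Sketch only: the preprocessing sub-graph generated for an NV12 input "data" with batch 2.
// width, height, networkInputLayout and meanValues are assumed to be defined as in the code above.
cldnn::topology topology;
std::vector<cldnn::primitive_id> reorders;
for (int i = 0; i < 2; i++) {
    std::string y_name  = "data_Y"  + std::to_string(i);
    std::string uv_name = "data_UV" + std::to_string(i);
    cldnn::layout y_layout(cldnn::data_types::u8,  cldnn::format::nv12, { 1, 1, width, height });
    cldnn::layout uv_layout(cldnn::data_types::u8, cldnn::format::nv12, { 1, 2, width / 2, height / 2 });
    topology.add(cldnn::input_layout(y_name,  y_layout));
    topology.add(cldnn::input_layout(uv_name, uv_layout));
    cldnn::primitive_id reorder_id = "reorder:data" + std::to_string(i) + "_preprocess";  // illustrative id
    // Biplanar NV12 -> network layout reorder with mean-value subtraction (NONE / MEAN_VALUE case).
    topology.add(cldnn::reorder(reorder_id, y_name, uv_name, networkInputLayout, meanValues));
    reorders.push_back(reorder_id);
}
// The batched result seen by the rest of the network: concatenation along the batch dimension.
topology.add(cldnn::concatenation("concat:data_preprocess", reorders, cldnn::concatenation::along_b));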

View File

@@ -127,6 +127,150 @@ TEST_F(RemoteBlob_Test, smoke_canInferOnUserContext) {
}
}
class BatchedBlob_Test : public CommonTestUtils::TestsCommon, public testing::WithParamInterface<size_t> {
void SetUp() override {
num_batch = this->GetParam();
};
public:
static std::string getTestCaseName(const testing::TestParamInfo<std::size_t> &obj) {
return "num_batch_" + std::to_string(obj.param);
}
protected:
size_t num_batch;
std::vector<std::shared_ptr<ngraph::Function>> fn_ptrs;
};
TEST_P(BatchedBlob_Test, canInputNV12) {
#if defined(_WIN32) || defined(ANDROID)
GTEST_SKIP();
#endif
const int height = 16;
const int width = 16;
// ------------------------------------------------------
// Prepare input data
const InferenceEngine::TensorDesc y_plane_desc(InferenceEngine::Precision::U8, {1, 1, height, width},
InferenceEngine::Layout::NHWC);
const InferenceEngine::TensorDesc uv_plane_desc(InferenceEngine::Precision::U8, {1, 2, height / 2, width / 2},
InferenceEngine::Layout::NHWC);
std::vector<InferenceEngine::Blob::Ptr> fake_image_data_y;
std::vector<InferenceEngine::Blob::Ptr> fake_image_data_uv;
for (int i = 0; i < num_batch; i++) {
fake_image_data_y.push_back(FuncTestUtils::createAndFillBlob(y_plane_desc, 50, 0, 1, i));
fake_image_data_uv.push_back(FuncTestUtils::createAndFillBlob(uv_plane_desc, 256, 0, 1, i));
}
auto ie = InferenceEngine::Core();
// ------------------------------------------------------
// inference using remote blob with batch
auto fn_ptr_remote = ngraph::builder::subgraph::makeConvPoolRelu({num_batch, 3, height, width});
CNNNetwork net_remote(fn_ptr_remote);
net_remote.getInputsInfo().begin()->second->setLayout(Layout::NCHW);
net_remote.getInputsInfo().begin()->second->setPrecision(Precision::U8);
net_remote.getInputsInfo().begin()->second->getPreProcess().setColorFormat(ColorFormat::NV12);
/* XXX: is it correct to set KEY_CLDNN_NV12_TWO_INPUTS in case of remote blob? */
auto exec_net_b = ie.LoadNetwork(net_remote, CommonTestUtils::DEVICE_GPU,
{ { CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS, PluginConfigParams::YES} });
auto inf_req_remote = exec_net_b.CreateInferRequest();
auto cldnn_context = exec_net_b.GetContext();
cl_context ctx = std::dynamic_pointer_cast<ClContext>(cldnn_context)->get();
auto ocl_instance = std::make_shared<OpenCL>(ctx);
cl_int err;
std::vector<cl_mem> nv12_image_plane_y, nv12_image_plane_uv;
std::vector<cl::Image2D> img_y, img_uv;
std::vector<Blob::Ptr> blob_remote;
for (int i = 0; i < num_batch; i++) {
cl_image_format image_format;
cl_image_desc image_desc = { 0 };
image_format.image_channel_order = CL_R;
image_format.image_channel_data_type = CL_UNORM_INT8;
image_desc.image_type = CL_MEM_OBJECT_IMAGE2D;
image_desc.image_width = width;
image_desc.image_height = height;
nv12_image_plane_y.push_back(clCreateImage(ocl_instance->_context.get(), CL_MEM_READ_WRITE, &image_format, &image_desc, NULL, &err));
ASSERT_EQ(err, 0);
image_format.image_channel_order = CL_RG;
image_desc.image_width = width / 2;
image_desc.image_height = height / 2;
nv12_image_plane_uv.push_back(clCreateImage(ocl_instance->_context.get(), CL_MEM_READ_WRITE, &image_format, &image_desc, NULL, &err));
ASSERT_EQ(err, 0);
size_t origin[3] = { 0, 0, 0 };
size_t y_region[3] = { (size_t)width, (size_t)height, 1 };
size_t uv_region[3] = { (size_t)width / 2, (size_t)height / 2, 1 };
err = clEnqueueWriteImage(ocl_instance->_queue.get(), nv12_image_plane_y[i],
true, origin, y_region, 0, 0, fake_image_data_y[i]->buffer(), 0, NULL, NULL);
ASSERT_EQ(err, 0);
err = clEnqueueWriteImage(ocl_instance->_queue.get(), nv12_image_plane_uv[i],
true, origin, uv_region, 0, 0, fake_image_data_uv[i]->buffer(), 0, NULL, NULL);
ASSERT_EQ(err, 0);
img_y.push_back(cl::Image2D(nv12_image_plane_y[i]));
img_uv.push_back(cl::Image2D(nv12_image_plane_uv[i]));
blob_remote.push_back(make_shared_blob_nv12(cldnn_context, img_y[i], img_uv[i]));
}
if (num_batch == 1) {
inf_req_remote.SetBlob(net_remote.getInputsInfo().begin()->first, blob_remote[0]);
} else {
auto batched_blob = make_shared_blob<BatchedBlob>(blob_remote);
inf_req_remote.SetBlob(net_remote.getInputsInfo().begin()->first, batched_blob);
}
inf_req_remote.Infer();
auto outputBlob_shared = inf_req_remote.GetBlob(net_remote.getOutputsInfo().begin()->first);
// ------------------------------------------------------
// Setup to inference using local blob with batch=1
auto fn_ptr_local = ngraph::builder::subgraph::makeConvPoolRelu({1, 3, height, width});
CNNNetwork net_local(fn_ptr_local);
net_local.getInputsInfo().begin()->second->setLayout(Layout::NCHW);
net_local.getInputsInfo().begin()->second->setPrecision(Precision::U8);
net_local.getInputsInfo().begin()->second->getPreProcess().setColorFormat(ColorFormat::NV12);
auto exec_net_b1 = ie.LoadNetwork(net_local, CommonTestUtils::DEVICE_GPU);
auto inf_req_local = exec_net_b1.CreateInferRequest();
// Run regular input for each image and compare against batched blob
for (int i = 0; i < num_batch; i++) {
auto y_blob = make_shared_blob<uint8_t>(y_plane_desc, fake_image_data_y[i]->buffer().as<uint8_t *>());
auto uv_blob = make_shared_blob<uint8_t>(uv_plane_desc, fake_image_data_uv[i]->buffer().as<uint8_t *>());
auto blob = make_shared_blob<NV12Blob>(y_blob, uv_blob);
inf_req_local.SetBlob(net_local.getInputsInfo().begin()->first, blob);
inf_req_local.Infer();
auto output_blob_local = inf_req_local.GetBlob(net_local.getOutputsInfo().begin()->first);
// This network generates a [1, size] tensor regardless of batch, so the batched output needs to be split before comparing
auto split_shared_blob = make_shared_blob<float_t>(output_blob_local->getTensorDesc(),
outputBlob_shared->buffer().as<float_t *>() + output_blob_local->size() * i);
ASSERT_EQ(output_blob_local->size(), split_shared_blob->size());
float thr = 0.1;
FuncTestUtils::compareBlobs(output_blob_local, split_shared_blob, thr, "", false);
}
}
const std::vector<size_t> num_batches{1, 2, 4};
INSTANTIATE_TEST_CASE_P(smoke_RemoteBlob, BatchedBlob_Test, ::testing::ValuesIn(num_batches), BatchedBlob_Test::getTestCaseName);
class TwoNets_Test : public CommonTestUtils::TestsCommon, public testing::WithParamInterface<size_t> {
void SetUp() override {
num_streams = this->GetParam();
@@ -211,6 +355,6 @@ TEST_P(TwoNets_Test, canInferTwoExecNets) {
}
}
const std::vector<size_t> num_strems{1, 2};
const std::vector<size_t> num_streams{1, 2};
INSTANTIATE_TEST_CASE_P(smoke_RemoteBlob, TwoNets_Test, ::testing::ValuesIn(num_strems), TwoNets_Test::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_RemoteBlob, TwoNets_Test, ::testing::ValuesIn(num_streams), TwoNets_Test::getTestCaseName);

View File

@@ -10,8 +10,9 @@ using namespace InferenceEngine;
const std::vector<FuncTestUtils::BlobKind> blobKinds = {
FuncTestUtils::BlobKind::Simple,
FuncTestUtils::BlobKind::Compound,
FuncTestUtils::BlobKind::BatchOfSimple
FuncTestUtils::BlobKind::Compound
/* BatchOfSimple is not supported on GPU currently; a batch of remote blobs is supported */
/* , FuncTestUtils::BlobKind::BatchOfSimple */
};
const SetBlobOfKindConfig gpuConfig{}; //nothing special