[GPU] Add batching surface to new API (#9435)

This commit is contained in:
Roman Lyamin
2021-12-27 09:46:37 +03:00
committed by GitHub
parent d6dcf58846
commit b050d39f89
6 changed files with 315 additions and 15 deletions

View File

@@ -324,6 +324,7 @@ void SetExeNetworkInfo(const std::shared_ptr<IExecutableNetworkInternal>& exeNet
new_param->set_element_type(
InferenceEngine::details::convertPrecision(inputsInfo.at(new_param->get_friendly_name())->getPrecision()));
new_param->set_layout(param->get_layout());
new_param->output(0).get_rt_info() = param->output(0).get_rt_info();
new_param->validate_and_infer_types();
const_params.emplace_back(new_param);
}

View File

@@ -45,6 +45,7 @@ public:
InferenceEngine::Blob::Ptr GetBlob(const std::string& name) override;
void SetBlob(const std::string& name, const InferenceEngine::Blob::Ptr &data) override;
void SetBlobs(const std::string& name, const std::vector<InferenceEngine::Blob::Ptr> &data) override;
void SetBatch(int batch = -1) override;
void SetGraph(std::shared_ptr<Graph> graph);
@@ -72,6 +73,8 @@ private:
std::map<std::string, cldnn::primitive_id> inputsMap;
std::map<std::string, cldnn::primitive_id> outputsMap;
std::map<std::string, std::vector<InferenceEngine::Blob::Ptr>> inputTensorsMap;
bool m_useProfiling = false;
bool m_useStreams = false;
bool m_useExternalQueue = false;

View File

@@ -13,6 +13,7 @@
#include "intel_gpu/plugin/compiled_model.hpp"
#include "intel_gpu/plugin/itt.hpp"
#include "intel_gpu/runtime/debug_configuration.hpp"
#include "openvino/core/preprocess/input_tensor_info.hpp"
#include <ie_algorithm.hpp>
#include <debug.h>
@@ -218,6 +219,9 @@ void InferRequest::SetBlob(const std::string& name, const Blob::Ptr& data) {
if (0 == dataSize) {
IE_THROW() << "Input data is empty. Input name: \'" << name << "\'";
}
if (inputTensorsMap.find(name) != inputTensorsMap.end()) {
inputTensorsMap.erase(name);
}
const bool compoundBlobPassed = data->is<CompoundBlob>();
InputInfo::Ptr foundInput;
@@ -352,6 +356,88 @@ void InferRequest::SetBlob(const std::string& name, const Blob::Ptr& data) {
}
}
void InferRequest::SetBlobs(const std::string& name, const std::vector<Blob::Ptr>& blobs) {
if (blobs.size() == 1) {
SetBlob(name, blobs[0]);
return;
}
if (name.empty()) {
IE_THROW(NotFound) << "Failed to set blobs with empty name";
}
if (blobs.empty()) {
IE_THROW(NotAllocated) << "Failed to set empty blobs with name: \'" << name << "\'";
}
bool empty_data = std::any_of(blobs.begin(), blobs.end(), [](const Blob::Ptr& blob) {
return blob->size() == 0;
});
if (empty_data) {
IE_THROW() << "At least one of the input blobs is empty. Input name: \'" << name << "\'";
}
bool is_compound = std::any_of(blobs.begin(), blobs.end(), [](const Blob::Ptr& blob) {
return blob->is<CompoundBlob>();
});
if (is_compound) {
IE_THROW(NotImplemented) << cannot_set_compound;
}
bool is_buffer = std::all_of(blobs.begin(), blobs.end(), [](const Blob::Ptr& blob) {
return blob->is<gpu::ClBufferBlob>();
});
bool is_surface = std::all_of(blobs.begin(), blobs.end(), [](const Blob::Ptr& blob) {
return blob->is<gpu::ClImage2DBlob>();
});
bool is_remote = is_buffer || is_surface;
bool is_host = std::all_of(blobs.begin(), blobs.end(), [](const Blob::Ptr& blob) {
return blob->is<InferenceEngine::MemoryBlob>();
});
is_host &= !is_remote;
if (!is_host && !is_remote) {
IE_THROW() << "Incorrect input blobs. All blobs must be of the same type";
}
InputInfo::Ptr foundInput;
DataPtr foundOutput;
bool is_input = findInputAndOutputBlobByName(name, foundInput, foundOutput);
if (!is_input) {
IE_THROW() << "SetBlobs method doesn't support outputs";
}
if (is_buffer) {
IE_THROW(NotImplemented) << "SetBlobs method doesn't support buffer blobs";
}
const TensorDesc& desc = foundInput->getTensorDesc();
size_t dataBinSize = blobs.front()->size() * blobs.front()->element_size() * blobs.size();
size_t netReqBinSize = std::accumulate(desc.getDims().begin(), desc.getDims().end(),
desc.getPrecision().size(),
std::multiplies<size_t>());
if (dataBinSize != netReqBinSize) {
IE_THROW() << "Incorrect binary data size for input blobs with name: \'" << name << "\' " <<
"Current: " << dataBinSize << " Required: " << netReqBinSize;
}
if (_inputs.find(name) != _inputs.end()) {
_inputs.erase(name);
}
if (is_remote) {
for (auto& blob : blobs) {
auto impl = getBlobImpl(blob->as<gpu::ClBlob>());
if (!impl->is_allocated()) {
impl->allocate();
}
}
}
inputTensorsMap.insert({ name, blobs });
}
void InferRequest::checkBlobs() {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::checkBlobs");
for (auto const &input : _inputs) {
@@ -519,6 +605,24 @@ void InferRequest::enqueue() {
// set input and output memory from request blob maps
// into the network object primitives
std::vector<cldnn::event::ptr> dependencies;
for (const auto& inputTensor : inputTensorsMap) {
const auto& blobs = inputTensor.second;
auto blobsDesc = blobs.front()->getTensorDesc();
bool is_surface = std::all_of(blobs.begin(), blobs.end(), [](const Blob::Ptr& blob) {
return blob->is<gpu::ClImage2DBlob>();
});
if (is_surface) {
for (size_t i = 0; i < blobs.size(); ++i) {
std::string new_name = inputTensor.first + "_" + std::to_string(i);
_inputs[new_name] = blobs[i];
_deviceInputs[new_name] = blobs[i];
}
}
}
for (auto& item : _inputs) {
std::string inputName = item.first;
Blob::Ptr& inputBlob = item.second;
@@ -809,14 +913,30 @@ Blob::Ptr InferRequest::host_blob_from_device_blob(Blob::Ptr blobPtr) {
void InferRequest::allocate_inputs() {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::allocate_inputs");
auto inputLayouts = m_graph->GetInputLayouts();
// allocate inputs
for (auto& ni : _networkInputs) {
std::string name = ni.first;
const TensorDesc& desc = ni.second->getTensorDesc();
if (ColorFormat::NV12 == ni.second->getPreProcess().getColorFormat() &&
m_graph->getConfig().nv12_two_inputs) {
} else {
bool is_nv12_input = ColorFormat::NV12 == ni.second->getPreProcess().getColorFormat() &&
m_graph->getConfig().nv12_two_inputs;
auto parameter = std::find_if(_parameters.begin(), _parameters.end(), [&](const std::shared_ptr<const ov::Node>& node) {
return node->get_friendly_name() == name;
});
if (parameter != _parameters.end()) {
if (parameter->get()->output(0).get_rt_info().count(ov::preprocess::TensorInfoMemoryType::get_type_info_static())) {
std::string mem_type = parameter->get()->output(0).get_rt_info().at(ov::preprocess::TensorInfoMemoryType::get_type_info_static())
.as<ov::preprocess::TensorInfoMemoryType>().value;
if (mem_type.find(GPU_CONFIG_KEY(SURFACE)) != std::string::npos) {
is_nv12_input = true;
}
}
}
if (!is_nv12_input) {
auto litr = inputLayouts.find(name);
if (litr == inputLayouts.end()) {
IE_THROW() << "Input layout for " << name << " is not found";

View File

@@ -6,6 +6,7 @@
#include "intel_gpu/plugin/common_utils.hpp"
#include "intel_gpu/primitives/convert_color.hpp"
#include "intel_gpu/primitives/concatenation.hpp"
#include "openvino/core/preprocess/input_tensor_info.hpp"
namespace ov {
@@ -32,13 +33,34 @@ static void CreateCommonConvertColorOp(Program& p, const std::shared_ptr<ngraph:
memory_type = cldnn::convert_color::memory_type::image;
}
}
p.AddPrimitive(cldnn::convert_color(layerName,
inputPrimitives,
from_color,
to_color,
memory_type,
out_layout,
op->get_friendly_name()));
if (outShape.batch[0] > 1 && memory_type == cldnn::convert_color::memory_type::image) {
std::vector<cldnn::primitive_id> convert_color_names;
for (size_t b = 0; b < outShape.batch[0]; ++b) {
cldnn::primitive::primitive_id_arr batchedInputPrimitives = { inputPrimitives[0] + "_" + std::to_string(b),
inputPrimitives[1] + "_" + std::to_string(b)};
cldnn::primitive_id batched_prim_id = layerName + "_" + std::to_string(b);
convert_color_names.emplace_back(batched_prim_id);
out_layout.size.batch[0] = 1;
p.AddPrimitive(cldnn::convert_color(batched_prim_id,
batchedInputPrimitives,
from_color,
to_color,
memory_type,
out_layout,
op->get_friendly_name()));
}
p.AddPrimitive(cldnn::concatenation(layerName, convert_color_names, cldnn::concatenation::along_b, op->get_friendly_name()));
} else {
p.AddPrimitive(cldnn::convert_color(layerName,
inputPrimitives,
from_color,
to_color,
memory_type,
out_layout,
op->get_friendly_name()));
}
p.AddPrimitiveToProfiler(op);
}

View File

@@ -196,12 +196,26 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::
networkInputLayout.format = cldnn::format::nv12;
}
}
networkInputLayout.size = { TensorValue(inputDims[0]), TensorValue(inputDims[3]),
TensorValue(inputDims[2]), TensorValue(inputDims[1]) };
p.inputLayouts.insert({ inputInfo->name(), networkInputLayout });
p.AddPrimitive(cldnn::input_layout(inputName, networkInputLayout, inputInfo->name()));
p.AddPrimitiveToProfiler(op);
if (networkInputLayout.format == cldnn::format::nv12 && networkInputLayout.size.batch[0] > 1) {
networkInputLayout.size = { 1, TensorValue(inputDims[3]), TensorValue(inputDims[2]), TensorValue(inputDims[1]) };
std::vector<cldnn::primitive_id> inputs;
for (size_t i = 0; i < inputDims[0]; ++i) {
std::string batched_name = inputName + "_" + std::to_string(i);
p.inputLayouts.insert({ inputInfo->name() + "_" + std::to_string(i), networkInputLayout });
inputs.emplace_back(batched_name);
p.AddPrimitive(cldnn::input_layout(batched_name, networkInputLayout, inputInfo->name()));
p.AddPrimitiveToProfiler(op);
}
} else {
networkInputLayout.size = { TensorValue(inputDims[0]), TensorValue(inputDims[3]),
TensorValue(inputDims[2]), TensorValue(inputDims[1]) };
p.inputLayouts.insert({ inputInfo->name(), networkInputLayout });
p.AddPrimitive(cldnn::input_layout(inputName, networkInputLayout, inputInfo->name()));
p.AddPrimitiveToProfiler(op);
}
} else {
if (ColorFormat::NV12 == preProcess.getColorFormat() && p.GetConfig().nv12_two_inputs) {
// for NV12, create two input layouts with reorder instead of one,

View File

@@ -742,3 +742,143 @@ TEST_F(OVRemoteTensor_Test, NV12toBGR_buffer) {
float thr = 0.1;
FuncTestUtils::compare_tensor(out_tensor, output_tensor_regular, thr);
}
// Parameterized fixture for batched remote-tensor tests; the test parameter
// is the batch size.
class OVRemoteTensorBatched_Test : public CommonTestUtils::TestsCommon, public testing::WithParamInterface<size_t> {
public:
    // Produces a readable per-case suffix, e.g. "num_batch_2".
    static std::string getTestCaseName(const testing::TestParamInfo<std::size_t> &obj) {
        std::string case_name = "num_batch_";
        case_name += std::to_string(obj.param);
        return case_name;
    }
protected:
    size_t num_batch;                                        // batch size under test (cached from GetParam())
    std::vector<std::shared_ptr<ngraph::Function>> fn_ptrs;  // functions kept alive for the test's duration
private:
    // Cache the test parameter before each test body runs.
    void SetUp() override {
        num_batch = GetParam();
    }
};
// Batched NV12 (two-plane) remote-surface test: feeds a batched compiled
// model with per-batch-item OpenCL 2D images (Y + UV planes), then re-runs
// each item through a batch-1 model on host tensors and compares the batched
// output slice-by-slice against the single-item results.
TEST_P(OVRemoteTensorBatched_Test, NV12toBGR_image) {
#if defined(ANDROID)
GTEST_SKIP();
#endif
const int height = 16;
const int width = 16;
// ------------------------------------------------------
// Prepare input data
// One full-resolution Y tensor and one half-resolution 2-channel UV tensor
// per batch item.
// NOTE(review): the trailing args (50/256, 0, 1, i) presumably control fill
// range/resolution/seed - confirm against FuncTestUtils::create_and_fill_tensor.
std::vector<ov::runtime::Tensor> fake_image_data_y, fake_image_data_uv;
for (int i = 0; i < num_batch; i++) {
fake_image_data_y.push_back(FuncTestUtils::create_and_fill_tensor(ov::element::u8, {1, 1, height, width}, 50, 0, 1, i));
fake_image_data_uv.push_back(FuncTestUtils::create_and_fill_tensor(ov::element::u8, {1, 2, height / 2, width / 2}, 256, 0, 1, i));
}
auto ie = ov::runtime::Core();
// ------------------------------------------------------
// inference using remote tensor
// Build a batched model whose input is declared as NV12 two-plane GPU
// surfaces with the BATCHED memory-type hint, converted to BGR in preprocessing.
auto fn_ptr_remote = ngraph::builder::subgraph::makeConvPoolRelu({num_batch, 3, height, width});
using namespace ov::preprocess;
auto p = PrePostProcessor(fn_ptr_remote);
p.input().tensor().set_element_type(ov::element::u8)
.set_color_format(ov::preprocess::ColorFormat::NV12_TWO_PLANES, {"y", "uv"})
.set_memory_type(std::string(GPU_CONFIG_KEY(SURFACE)) + GPU_CONFIG_KEY(BATCHED))
p.input().preprocess().convert_color(ov::preprocess::ColorFormat::BGR);
p.input().model().set_layout("NCHW");
auto function = p.build();
// The two-plane color format splits the input into separate Y and UV parameters.
auto param_input_y = fn_ptr_remote->get_parameters().at(0);
auto param_input_uv = fn_ptr_remote->get_parameters().at(1);
auto exec_net_b = ie.compile_model(function, CommonTestUtils::DEVICE_GPU);
auto inf_req_remote = exec_net_b.create_infer_request();
// Reuse the compiled model's OpenCL context so the images are shareable.
auto cldnn_context = exec_net_b.get_context().as<ov::runtime::intel_gpu::ocl::ClContext>();
cl_context ctx = cldnn_context.get();
auto ocl_instance = std::make_shared<OpenCL>(ctx);
cl_int err;
std::vector<cl_mem> nv12_image_plane_y, nv12_image_plane_uv;
std::vector<cl::Image2D> img_y, img_uv;
std::vector<ov::runtime::Tensor> tensor_remote_y, tensor_remote_uv;
// Create one pair of OpenCL 2D images per batch item (Y: CL_R/unorm8 at
// full resolution, UV: CL_RG/unorm8 at half resolution), upload the fake
// data, and wrap each image in a remote tensor.
for (size_t i = 0; i < num_batch; ++i) {
cl_image_format image_format;
cl_image_desc image_desc = { 0 };
image_format.image_channel_order = CL_R;
image_format.image_channel_data_type = CL_UNORM_INT8;
image_desc.image_type = CL_MEM_OBJECT_IMAGE2D;
image_desc.image_width = width;
image_desc.image_height = height;
nv12_image_plane_y.emplace_back(clCreateImage(ocl_instance->_context.get(), CL_MEM_READ_WRITE, &image_format, &image_desc, NULL, &err));
ASSERT_EQ(err, 0);
image_format.image_channel_order = CL_RG;
image_desc.image_width = width / 2;
image_desc.image_height = height / 2;
nv12_image_plane_uv.emplace_back(clCreateImage(ocl_instance->_context.get(), CL_MEM_READ_WRITE, &image_format, &image_desc, NULL, &err));
ASSERT_EQ(err, 0);
size_t origin[3] = { 0, 0, 0 };
size_t y_region[3] = { (size_t)width, (size_t)height, 1 };
size_t uv_region[3] = { (size_t)width / 2, (size_t)height / 2, 1 };
// Blocking writes: the host tensors can be reused immediately afterwards.
err = clEnqueueWriteImage(ocl_instance->_queue.get(), nv12_image_plane_y[i],
true, origin, y_region, 0, 0, fake_image_data_y[i].data(), 0, NULL, NULL);
ASSERT_EQ(err, 0);
err = clEnqueueWriteImage(ocl_instance->_queue.get(), nv12_image_plane_uv[i],
true, origin, uv_region, 0, 0, fake_image_data_uv[i].data(), 0, NULL, NULL);
ASSERT_EQ(err, 0);
img_y.emplace_back(nv12_image_plane_y[i]);
img_uv.emplace_back(nv12_image_plane_uv[i]);
tensor_remote_y.emplace_back(cldnn_context.create_tensor(param_input_y->get_element_type(), fake_image_data_y[i].get_shape(), img_y[i]));
tensor_remote_uv.emplace_back(cldnn_context.create_tensor(param_input_uv->get_element_type(), fake_image_data_uv[i].get_shape(), img_uv[i]));
}
// Bind the whole vector of per-batch-item remote tensors to each plane input
// in a single set_tensors() call - this is the batched surface API under test.
inf_req_remote.set_tensors(*param_input_y->output(0).get_tensor().get_names().begin(), tensor_remote_y);
inf_req_remote.set_tensors(*param_input_uv->output(0).get_tensor().get_names().begin(), tensor_remote_uv);
inf_req_remote.infer();
auto output_tensor_shared = inf_req_remote.get_tensor(function->get_results().at(0));
ASSERT_NO_THROW(output_tensor_shared.data());
// ------------------------------------------------------
// regular inference
// Reference path: a batch-1 model with host BUFFER inputs, run once per item.
auto fn_ptr_regular = ngraph::builder::subgraph::makeConvPoolRelu({1, 3, height, width});
using namespace ov::preprocess;
auto p_reg = PrePostProcessor(fn_ptr_regular);
p_reg.input().tensor().set_element_type(ov::element::u8)
.set_color_format(ov::preprocess::ColorFormat::NV12_TWO_PLANES, {"y", "uv"})
.set_memory_type(GPU_CONFIG_KEY(BUFFER));
p_reg.input().preprocess().convert_color(ov::preprocess::ColorFormat::BGR);
p_reg.input().model().set_layout("NCHW");
auto function_regular = p_reg.build();
auto param_input_y_reg = fn_ptr_regular->get_parameters().at(0);
auto param_input_uv_reg = fn_ptr_regular->get_parameters().at(1);
auto exec_net_regular = ie.compile_model(function_regular, CommonTestUtils::DEVICE_GPU);
auto inf_req_regular = exec_net_regular.create_infer_request();
for (size_t i = 0; i < num_batch; ++i) {
inf_req_regular.set_tensor(param_input_y_reg, fake_image_data_y[i]);
inf_req_regular.set_tensor(param_input_uv_reg, fake_image_data_uv[i]);
inf_req_regular.infer();
auto output_tensor_regular = inf_req_regular.get_tensor(exec_net_regular.output());
// Each single-item output must match the i-th slice of the batched output.
ASSERT_EQ(output_tensor_regular.get_size() * num_batch, output_tensor_shared.get_size());
float thr = 0.1;
FuncTestUtils::compareRawBuffers<float>(static_cast<float*>(output_tensor_shared.data()) + i * output_tensor_regular.get_size(),
static_cast<float*>(output_tensor_regular.data()),
output_tensor_regular.get_size(), output_tensor_regular.get_size(), thr);
}
}
// Batch sizes covered by the parameterized suite above (1 = degenerate case).
const std::vector<size_t> num_batches{ 1, 2, 4 };
INSTANTIATE_TEST_SUITE_P(smoke_RemoteTensor, OVRemoteTensorBatched_Test, ::testing::ValuesIn(num_batches), OVRemoteTensorBatched_Test::getTestCaseName);