[GPU] Fixes for infer request impl in dynamic cases (#13050)

* [GPU] Fixes for infer request impl in dynamic cases

* [GPU] Fixed inconsistent output shapes for LSTMSequence op

* [GPU] Update network::get_output_layout method

* [GPU] WA for USM memory allocations with 0 bytes
This commit is contained in:
Vladimir Paramuzov
2022-10-19 09:35:03 +04:00
committed by GitHub
parent dc1fe22fed
commit ef12d3976f
11 changed files with 123 additions and 76 deletions

View File

@@ -121,6 +121,7 @@ public:
}
memory::ptr get_output_memory(const primitive_id& output_id);
layout get_node_output_layout(const primitive_id& output_id) const;
/// @brief Returns the list of primitive ids before and after graph optimization.
/// @details If primitive was not optimized, the old and actual id will be the same.
@@ -178,6 +179,7 @@ public:
void set_arguments();
// Implementation specific calls
std::shared_ptr<primitive_inst> get_primitive(const primitive_id& id);
std::shared_ptr<const primitive_inst> get_primitive(const primitive_id& id) const;
std::string get_primitive_info(const primitive_id& id) const;
std::string get_implementation_info(const primitive_id& id) const;
const event::ptr& get_primitive_event(const primitive_id& id) const { return _events.at(id); }

View File

@@ -74,8 +74,7 @@ private:
std::vector<cldnn::event::ptr>& dependencies);
void prepare_output(const cldnn::primitive_id& outputName, InferenceEngine::Blob::Ptr& outputBlob);
InferenceEngine::Blob::Ptr create_host_blob(const InferenceEngine::TensorDesc& desc,
std::shared_ptr<InferenceEngine::IAllocator> alloc = nullptr);
InferenceEngine::Blob::Ptr create_host_blob(const InferenceEngine::TensorDesc& desc);
InferenceEngine::Blob::Ptr create_device_blob(const InferenceEngine::TensorDesc& desc);
void copy_output_data(cldnn::memory::ptr outputMemory, InferenceEngine::Blob::Ptr bptr);

View File

@@ -195,11 +195,14 @@ public:
const std::unordered_map<size_t, std::tuple<int64_t, size_t>>& get_profiling_data() const { return _profiling_data; }
const std::unordered_map<size_t, instrumentation::perf_counter_key>& get_profiling_info() const { return _profiling_info; }
layout get_node_output_layout() const { return _node_output_layout; }
protected:
primitive_inst(network& network, program_node const& node, bool allocate_memory);
network& _network;
program_node const& _node;
const layout _node_output_layout;
std::unique_ptr<kernel_impl_params> _impl_params;
std::unique_ptr<primitive_impl> _impl;

View File

@@ -587,6 +587,15 @@ memory::ptr network::get_output_memory(const primitive_id& output_id) {
return get_primitive(output_id)->output_memory_ptr();
}
// Returns the compile-time (program-node) layout for the network output with the given id.
// NOTE(review): this is the layout recorded at graph-build time, not the runtime memory
// layout — presumably the distinction matters for dynamic shapes, where the actual output
// memory may not exist yet; confirm against callers.
layout network::get_node_output_layout(const primitive_id& output_id) const {
auto res = std::find_if(_outputs.begin(), _outputs.end(), [&](const std::shared_ptr<primitive_inst>& v) {
// Match the output primitive instance by its id.
return v->id() == output_id;
});
OPENVINO_ASSERT(res != _outputs.end(), "[GPU] Couldn't get output layout for ", output_id, ". Output with such name is not found in the outputs list");
return (*res)->get_node_output_layout();
}
void network::allocate_primitives() {
std::vector<std::shared_ptr<program_node>> nodes_to_allocate{};
auto& po = _program->get_processing_order();
@@ -887,6 +896,11 @@ std::shared_ptr<primitive_inst> network::get_primitive(const primitive_id& id) {
return _primitives.at(id);
}
// Const overload of get_primitive: read-only lookup of a primitive instance by id.
// Asserts (rather than returning null) when the id is unknown, matching the
// behavior of the non-const overload.
std::shared_ptr<const primitive_inst> network::get_primitive(const primitive_id& id) const {
OPENVINO_ASSERT(_primitives.count(id) == 1, "[GPU] Can't get primitive with ", id, " id: primitive with such name hasn't been found in processing order");
return _primitives.at(id);
}
std::vector<std::shared_ptr<primitive_inst>> network::get_primitives(const std::vector<primitive_id>& ids) {
std::vector<std::shared_ptr<primitive_inst>> result(ids.size());
std::transform(std::begin(ids), std::end(ids), std::begin(result), [&](const primitive_id& id) {

View File

@@ -438,6 +438,7 @@ void primitive_inst::build_deps() {
primitive_inst::primitive_inst(network& network, program_node const& node, bool allocate_memory)
: _network(network)
, _node(node)
, _node_output_layout(node.get_output_layout())
, _impl_params(node.get_kernel_impl_params())
, _impl(node.get_selected_impl() ? node.get_selected_impl()->clone() : nullptr)
, _outputs({memory::ptr()})

View File

@@ -135,6 +135,7 @@ std::shared_ptr<cldnn::network> Graph::BuildNetwork(std::shared_ptr<cldnn::progr
Graph::variable_states_map Graph::AllocateVariablesMemories() {
Graph::variable_states_map states {};
const auto& memStatesInfo = m_program->GetVariablesStatesInfo();
OPENVINO_ASSERT(memStatesInfo.empty() || !GetNetwork()->is_dynamic(), "[GPU] Dynamic shapes are not supported yet for stateful models");
for (const auto& memStateInfo : memStatesInfo) {
std::vector<cldnn::layout> orderedLayouts {memStateInfo.second.begin(), memStateInfo.second.end()};
std::sort(orderedLayouts.begin(), orderedLayouts.end(), [](cldnn::layout& first, cldnn::layout& second) {

View File

@@ -16,6 +16,7 @@
#include "intel_gpu/runtime/debug_configuration.hpp"
#include "openvino/core/preprocess/input_tensor_info.hpp"
#include <ie_algorithm.hpp>
#include "ie_ngraph_utils.hpp"
#include <debug.h>
using namespace InferenceEngine;
@@ -158,15 +159,9 @@ Blob::Ptr InferRequest::GetBlob(const std::string& name) {
bool isDynamic = (node && node->get_output_partial_shape(0).is_dynamic());
if (is_input) {
// ROI blob is returned only if it was set previously. Otherwise default blob is returned.
auto it = _preProcData.find(name);
if (it != _preProcData.end()) {
data = it->second->getRoiBlob();
} else {
data = _inputs[name];
if (!isDynamic)
checkInputBlob(data, name, foundInput);
}
data = _inputs[name];
if (!isDynamic)
checkInputBlob(data, name, foundInput);
} else {
data = _outputs[name];
if (!isDynamic) {
@@ -390,11 +385,9 @@ void InferRequest::SetGraph(std::shared_ptr<Graph> graph) {
IE_THROW(NetworkNotLoaded);
}
if (!m_graph->GetNetwork()->is_dynamic()) {
allocate_inputs();
allocate_outputs();
variables_states_ = m_graph->AllocateVariablesMemories();
}
allocate_inputs();
allocate_outputs();
variables_states_ = m_graph->AllocateVariablesMemories();
}
InferRequest::InferRequest(InputsDataMap networkInputs, OutputsDataMap networkOutputs,
@@ -527,11 +520,13 @@ void InferRequest::wait() {
std::string outputID = outputsMap.empty() ? m_graph->MapOutputName(no.first) : outputsMap.at(no.first);
auto outputMemory = internal_outputs.at(outputID).get_memory();
if (_outputs.find(no.first) == _outputs.end()) {
bool need_output_update = _outputs.find(no.first) == _outputs.end() || _outputs.at(no.first)->byteSize() != outputMemory->size();
if (need_output_update) {
auto node = findOutputByNodeName(no.first);
auto out_partial_shape = node->get_output_partial_shape(0);
size_t out_rank = out_partial_shape.rank().get_length();
auto mem_dims = outputMemory->get_layout().get_shape();
size_t out_rank = out_partial_shape.size();
auto precision = InferenceEngine::Precision::FP32;
auto dims = SizeVector(mem_dims.begin(), mem_dims.end());
if (static_cast<int32_t>(out_rank) < static_cast<int32_t>(dims.size())) {
@@ -554,7 +549,11 @@ void InferRequest::wait() {
};
auto layout = layout_by_rank(out_rank);
auto tensorDesc = InferenceEngine::TensorDesc(precision, dims, layout);
_outputs[no.first] = create_host_blob(tensorDesc);
if (_outputs.find(no.first) == _outputs.end()) {
_outputs[no.first] = create_host_blob(tensorDesc);
} else {
_outputs[no.first]->setShape(dims);
}
}
Blob::Ptr bptr = _outputs[no.first];
@@ -593,9 +592,13 @@ void InferRequest::setup_stream_graph() {
m_graph = streamGraphs[streamID];
}
Blob::Ptr InferRequest::create_host_blob(const TensorDesc& desc, std::shared_ptr<InferenceEngine::IAllocator> alloc) {
Blob::Ptr InferRequest::create_host_blob(const TensorDesc& desc) {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::create_host_blob");
auto blob = make_blob_with_precision(desc, alloc ? alloc : CreateDefaultAllocator());
// Disable USM usage as USMHostAllocator may fail for attempt to allocate 0 bytes
// If we add WA for such case to avoid driver call, then deallocate method will return false and Blob::setShape call will throw an exception
bool use_usm = m_graph->GetEngine()->use_unified_shared_memory() && !m_graph->GetNetwork()->is_dynamic();
auto alloc = use_usm ? std::make_shared<USMHostAllocator>(m_graph->GetContext().get()) : CreateDefaultAllocator();
auto blob = make_blob_with_precision(desc, alloc);
blob->allocate();
return blob;
}
@@ -715,6 +718,8 @@ void InferRequest::allocate_inputs() {
IE_THROW() << "Input layout for " << name << " is not found";
}
auto input_layout = litr->second;
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << name << ": input blob]" << std::endl;
@@ -722,24 +727,20 @@ void InferRequest::allocate_inputs() {
if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16) {
TensorDesc desc_fp32 = desc;
desc_fp32.setPrecision(Precision::FP32);
auto blobPtr = create_device_blob(desc_fp32);
_deviceInputs[name] = blobPtr;
Blob::Ptr inputBlob = create_host_blob(desc);
_inputs[name] = inputBlob;
_inputs[name] = create_host_blob(desc);
if (input_layout.is_static())
_deviceInputs[name] = create_device_blob(desc_fp32);
} else {
if (m_graph->GetEngine()->use_unified_shared_memory()) {
// For USM case we create host blob using custom USM host allocator
// and then create shared device blob on top of this buffer
if (_inputs.find(name) == _inputs.end()) {
auto host_blob = create_host_blob(desc, std::make_shared<USMHostAllocator>(m_graph->GetContext().get()));
_inputs[name] = host_blob;
_deviceInputs[name] = create_shared_device_blob(desc, litr->second, host_blob->buffer().as<void*>());
_inputs[name] = create_host_blob(desc);
if (input_layout.is_static()) {
if (m_graph->GetEngine()->use_unified_shared_memory()) {
// For USM case we create host blob using custom USM host allocator
// and then create shared device blob on top of this buffer
auto host_blob = _inputs[name];
_deviceInputs[name] = create_shared_device_blob(desc, input_layout, host_blob->buffer().as<void*>());
} else {
_deviceInputs[name] = create_device_blob(desc);
}
} else {
_inputs[name] = create_host_blob(desc);
_deviceInputs[name] = create_device_blob(desc);
}
}
}
@@ -748,15 +749,17 @@ void InferRequest::allocate_inputs() {
void InferRequest::allocate_outputs() {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::allocate_outputs");
// allocate outputs
for (auto& no : _networkOutputs) {
std::string outputID = m_graph->MapOutputName(no.first);
const cldnn::layout output_layout = m_graph->GetNetwork()->get_output_memory(outputID)->get_layout();
const cldnn::layout output_layout = m_graph->GetNetwork()->get_node_output_layout(outputID);
TensorDesc desc = no.second->getTensorDesc();
// Due to some reason TensorDesc in InferRequest contains wrong dims
// while ExecutableNetwork contains proper ones. Thus replace dims with ones from exec network
// Can be removed once 76176 is resolved.
desc.setDims(m_graph->GetOutputSize(no.first));
if (output_layout.is_static())
desc.setDims(m_graph->GetOutputSize(no.first));
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
@@ -774,20 +777,20 @@ void InferRequest::allocate_outputs() {
else
device_blob_desc.setPrecision(Precision::FP32);
auto host_blob = create_host_blob(desc);
_outputs[no.first] = host_blob;
auto device_blob = create_device_blob(device_blob_desc);
_deviceOutputs[no.first] = device_blob;
_outputs[no.first] = create_host_blob(desc);
if (output_layout.is_static())
_deviceOutputs[no.first] = create_device_blob(device_blob_desc);
} else {
if (m_graph->GetEngine()->use_unified_shared_memory()) {
// For USM case we create host blob using custom USM host allocator
// and then create shared device blob on top of this buffer
auto host_blob = create_host_blob(desc, std::make_shared<USMHostAllocator>(m_graph->GetContext().get()));
_outputs[no.first] = host_blob;
_deviceOutputs[no.first] = create_shared_device_blob(desc, output_layout, host_blob->buffer().as<void*>());
} else {
_outputs[no.first] = create_host_blob(desc);
_deviceOutputs[no.first] = create_device_blob(desc);
_outputs[no.first] = create_host_blob(desc);
if (output_layout.is_static()) {
if (m_graph->GetEngine()->use_unified_shared_memory()) {
// For USM case we create host blob using custom USM host allocator
// and then create shared device blob on top of this buffer
auto host_blob = _outputs[no.first];
_deviceOutputs[no.first] = create_shared_device_blob(desc, output_layout, host_blob->buffer().as<void*>());
} else {
_deviceOutputs[no.first] = create_device_blob(desc);
}
}
}
}
@@ -817,6 +820,24 @@ void InferRequest::prepare_input(const cldnn::primitive_id& inputName, Blob::Ptr
if (inputLayoutItr == m_graph->GetInputLayouts().end()) {
IE_THROW() << "Input name mismatch.";
}
auto input_layout = inputLayoutItr->second;
if (input_layout.is_dynamic()) {
bool has_device_blob = _deviceInputs.find(inputName) != _deviceInputs.end();
bool should_allocate_device_blob = !has_device_blob;
if (has_device_blob) {
auto device_blob = _deviceInputs.at(inputName);
if (device_blob->byteSize() < inputBlob->byteSize()) {
should_allocate_device_blob = true;
}
}
if (should_allocate_device_blob) {
_deviceInputs[inputName] = create_device_blob(inputBlob->getTensorDesc());
} else {
_deviceInputs[inputName] = reinterpret_device_blob(_deviceInputs[inputName], inputBlob->getTensorDesc());
}
}
OPENVINO_ASSERT(_deviceInputs.find(inputName) != _deviceInputs.end(), "[GPU] Couldn't find device blob allocated for ", inputName, " input");
auto reqBlob = _deviceInputs.at(inputName)->as<gpu::ClBlob>();
auto _nw_ptr = m_graph->GetNetwork();
cldnn::primitive_id internalName = "parameter:" + inputName;
@@ -848,7 +869,7 @@ void InferRequest::prepare_input(const cldnn::primitive_id& inputName, Blob::Ptr
auto input_layout = m_graph->GetInputLayouts().find(inputName);
if (input_layout != m_graph->GetInputLayouts().end()) {
if (input_layout->second.format != inputMem->get_layout().format) {
if (input_layout->second.format != inputMem->get_layout().format && input_layout->second.is_static()) {
inputMem = m_graph->GetNetwork()->get_engine().reinterpret_buffer(*inputMem, input_layout->second);
}
}
@@ -891,6 +912,9 @@ void InferRequest::prepare_input(const cldnn::primitive_id& inputName, Blob::Ptr
void InferRequest::prepare_output(const cldnn::primitive_id& outputName, Blob::Ptr& outputBlob) {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::prepare_output");
// Missing output in _deviceOutputs means that the network is dynamic and outputs couldn't be pre-allocated
if (_deviceOutputs.find(outputName) == _deviceOutputs.end())
return;
Blob::Ptr reqBlob = _deviceOutputs.at(outputName);
cldnn::primitive_id internalName = outputsMap[outputName];
auto _nw_ptr = m_graph->GetNetwork();
@@ -921,7 +945,7 @@ InferenceEngine::Blob::Ptr InferRequest::create_device_blob(const InferenceEngin
nullptr,
0,
0,
RemoteBlobImpl::BlobType::BT_USM_HOST_INTERNAL);
RemoteBlobImpl::BlobType::BT_USM_DEVICE_INTERNAL);
getBlobImpl(blobPtr.get())->allocate();
return blobPtr;
} else {

View File

@@ -28,12 +28,14 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::
auto inputInfo = networkInputs.at(op->get_friendly_name());
// first create and add the input layout
const auto inputDesc = inputInfo->getTensorDesc();
auto inputDims = op->get_partial_shape();
auto input_pshape = op->get_partial_shape();
InferenceEngine::Layout l = inputDesc.getLayout();
InferenceEngine::Precision ip = inputDesc.getPrecision();
cldnn::format inputFormat = cldnn::format::bfyx;
if (InferenceEngine::Layout::BLOCKED == l && 6 == inputDims.size()) {
if (input_pshape.is_dynamic()) {
inputFormat = cldnn::format::get_default_format(input_pshape.size());
} else if (InferenceEngine::Layout::BLOCKED == l && 6 == input_pshape.size()) {
inputFormat = cldnn::format::bfwzyx;
} else {
inputFormat = FormatFromLayout(l);
@@ -43,7 +45,7 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::
auto inputName = layer_type_name_ID(op);
auto preProcess = inputInfo->getPreProcess();
size_t meanChannels = preProcess.getNumberOfChannels();
cldnn::layout networkInputLayout(inputDims,
cldnn::layout networkInputLayout(input_pshape,
cldnn::element_type_to_data_type(op->get_output_element_type(0)),
inputFormat);
cldnn::primitive_id meanBlobID = inputName + Program::m_meanValuesTag;
@@ -70,7 +72,7 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::
IE_ASSERT(meanChannels);
// first merge all mean values to a single blob
// todo make sure mean blob precision is the same as the input precision
auto meanDims = inputDims;
auto meanDims = input_pshape;
// overwrite batches with 1
switch (meanDims.size()) {
case 4: meanDims[0] = 1;
@@ -155,10 +157,10 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::
}
if (networkInputLayout.format == cldnn::format::nv12 && networkInputLayout.get_tensor().batch[0] > 1) {
networkInputLayout.set_partial_shape({ 1, inputDims[3], inputDims[1], inputDims[2] });
networkInputLayout.set_partial_shape({ 1, input_pshape[3], input_pshape[1], input_pshape[2] });
std::vector<cldnn::primitive_id> inputs;
for (int64_t i = 0; i < inputDims[0].get_length(); ++i) {
for (int64_t i = 0; i < input_pshape[0].get_length(); ++i) {
std::string batched_name = inputName + "_" + std::to_string(i);
p.inputLayouts.insert({ inputInfo->name() + "_" + std::to_string(i), networkInputLayout });
inputs.emplace_back(batched_name);
@@ -166,7 +168,7 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::
}
p.primitive_ids[inputName] = inputName;
} else {
networkInputLayout.set_partial_shape({ inputDims[0], inputDims[3], inputDims[1], inputDims[2] });
networkInputLayout.set_partial_shape({ input_pshape[0], input_pshape[3], input_pshape[1], input_pshape[2] });
p.inputLayouts.insert({ inputInfo->name(), networkInputLayout });
p.add_primitive(*op, cldnn::input_layout(inputName, networkInputLayout));
@@ -180,9 +182,9 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::
IE_THROW() << "Unsupported layout (" << l << ") or precision "
<< ip.name() << ") for NV12 input " + inputInfo->name();
}
int height = inputDims[2].get_length();
int width = inputDims[3].get_length();
size_t batch = inputDims[0].get_length();
int height = input_pshape[2].get_length();
int width = input_pshape[3].get_length();
size_t batch = input_pshape[0].get_length();
std::vector<cldnn::primitive_id> reorders;
for (size_t i = 0; i < batch; i++) {
@@ -228,7 +230,7 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::
reorders.push_back(preprocessPrimID);
}
if (inputDims[0].get_length() > 1) {
if (input_pshape[0].get_length() > 1) {
auto concatPrimID = "concat:" + inputName + Program::m_preProcessTag;
p.add_primitive(*op, cldnn::concatenation(concatPrimID, reorders, 0));
}

View File

@@ -66,9 +66,7 @@ static void CreateResultOp(Program& p, const std::shared_ptr<ngraph::op::v0::Res
auto reorder_primitive = cldnn::reorder(outLayerName,
outputID,
FormatFromLayout(outputlayout),
DataTypeFromPrecision(precision),
std::vector<float>(),
cldnn::reorder_mean_mode::subtract);
DataTypeFromPrecision(precision));
p.add_primitive(*op, reorder_primitive, {originalOutName});
p.outputDims[originalOutName] = outputDesc.getDims();
p.prevPrimitiveIDs[outLayerName] = {originalOutName};

View File

@@ -261,8 +261,7 @@ static void CreateLSTMSequenceOp(Program& p, const std::shared_ptr<ngraph::op::v
hiddenStr = crop_id + ":hidden";
cellStr = crop_id + ":cell";
cldnn::primitive_id outputHiddenID = layerName + ".out1";
p.add_primitive(*op, cldnn::crop(hiddenStr, lstm_elt_id, hiddenSz, cldnn::tensor{ 0, 0, 0, 0 }), {outputHiddenID});
p.add_primitive(*op, cldnn::crop(hiddenStr, lstm_elt_id, hiddenSz, cldnn::tensor{ 0, 0, 0, 0 }));
output_ids_offsets.push_back(hiddenStr);
if (i < lstm_sequence_len - 1) {
@@ -271,16 +270,18 @@ static void CreateLSTMSequenceOp(Program& p, const std::shared_ptr<ngraph::op::v
// last hidden state crop (output 2)
// last cell state crop (output 3)
cldnn::primitive_id outputCellID = layerName + ".out2";
p.add_primitive(*op, cldnn::crop(cellStr, lstm_elt_id, hiddenSz, cellCropSz), {outputCellID});
p.add_primitive(*op, cldnn::crop(cellStr, lstm_elt_id, hiddenSz, cellCropSz));
}
}
if (!isForward) std::reverse(output_ids_offsets.begin(), output_ids_offsets.end());
// concatenated hidden state (output 1)
cldnn::primitive_id outputConcatID = layerName + ".out0";
cldnn::primitive_id concatStr = layerName + ":hiddenConcat";
p.add_primitive(*op, cldnn::concatenation(concatStr, output_ids_offsets, 1), {outputConcatID, layerName});
p.add_primitive(*op, cldnn::concatenation(concatStr, output_ids_offsets, 1));
p.add_primitive(*op, cldnn::reshape(layerName + ".out0", concatStr, tensor_from_dims(op->get_output_shape(0))), {layerName});
p.add_primitive(*op, cldnn::reshape(layerName + ".out1", hiddenStr, tensor_from_dims(op->get_output_shape(1))));
p.add_primitive(*op, cldnn::reshape(layerName + ".out2", cellStr, tensor_from_dims(op->get_output_shape(2))));
}
REGISTER_FACTORY_IMPL(v4, LSTMCell);