From ef12d3976fe948d5c9d45ab2aa7cbe96a8663bb1 Mon Sep 17 00:00:00 2001
From: Vladimir Paramuzov
Date: Wed, 19 Oct 2022 09:35:03 +0400
Subject: [PATCH] [GPU] Fixes for infer request impl in dynamic cases (#13050)

* [GPU] Fixes for infer request impl in dynamic cases
* [GPU] Fixed inconsistent output shapes for LSTMSequence op
* [GPU] Update network::get_output_layout method
* [GPU] WA for USM memory allocations with 0 bytes
---
 .../include/intel_gpu/graph/network.hpp       |   2 +
 .../intel_gpu/plugin/infer_request.hpp        |   3 +-
 .../src/graph/include/primitive_inst.h        |   3 +
 src/plugins/intel_gpu/src/graph/network.cpp   |  14 ++
 .../intel_gpu/src/graph/primitive_inst.cpp    |   1 +
 src/plugins/intel_gpu/src/plugin/graph.cpp    |   1 +
 .../intel_gpu/src/plugin/infer_request.cpp    | 124 +++++++++++-------
 .../intel_gpu/src/plugin/ops/parameter.cpp    |  24 ++--
 .../intel_gpu/src/plugin/ops/result.cpp       |   4 +-
 src/plugins/intel_gpu/src/plugin/ops/rnn.cpp  |  13 +-
 .../skip_tests_config.cpp                     |  10 +-
 11 files changed, 123 insertions(+), 76 deletions(-)

diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
index 2fbb0d19354..290b8f02903 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
@@ -121,6 +121,7 @@ public:
     }

     memory::ptr get_output_memory(const primitive_id& output_id);
+    layout get_node_output_layout(const primitive_id& output_id) const;

     /// @brief Returns the list of primitive ids before and after graph optimization.
     /// @details If primitive was not optimized, the old and actual id will be the same.
@@ -178,6 +179,7 @@ public:
     void set_arguments();

     // Implementation specific calls
     std::shared_ptr<primitive_inst> get_primitive(const primitive_id& id);
+    std::shared_ptr<const primitive_inst> get_primitive(const primitive_id& id) const;
     std::string get_primitive_info(const primitive_id& id) const;
     std::string get_implementation_info(const primitive_id& id) const;
     const event::ptr& get_primitive_event(const primitive_id& id) const { return _events.at(id); }
diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request.hpp
index 16660d9bc91..71fe2fccd04 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request.hpp
@@ -74,8 +74,7 @@ private:
                         std::vector<cldnn::event::ptr>& dependencies);
     void prepare_output(const cldnn::primitive_id& outputName, InferenceEngine::Blob::Ptr& outputBlob);

-    InferenceEngine::Blob::Ptr create_host_blob(const InferenceEngine::TensorDesc& desc,
-                                                std::shared_ptr<InferenceEngine::IAllocator> alloc = nullptr);
+    InferenceEngine::Blob::Ptr create_host_blob(const InferenceEngine::TensorDesc& desc);
     InferenceEngine::Blob::Ptr create_device_blob(const InferenceEngine::TensorDesc& desc);

     void copy_output_data(cldnn::memory::ptr outputMemory, InferenceEngine::Blob::Ptr bptr);
diff --git a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h
index 8ff9bbd388f..1563c18bf57 100644
--- a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h
+++ b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h
@@ -195,11 +195,14 @@ public:
     const std::unordered_map>& get_profiling_data() const { return _profiling_data; }
     const std::unordered_map& get_profiling_info() const { return _profiling_info; }
+    layout get_node_output_layout() const { return _node_output_layout; }
+
 protected:
     primitive_inst(network& network, program_node const& node, bool allocate_memory);

     network& _network;
     program_node const& _node;
+    const layout _node_output_layout;

     std::unique_ptr<kernel_impl_params> _impl_params;
     std::unique_ptr<primitive_impl> _impl;
diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp
index 170c3984578..25f23559d60 100644
--- a/src/plugins/intel_gpu/src/graph/network.cpp
+++ b/src/plugins/intel_gpu/src/graph/network.cpp
@@ -587,6 +587,15 @@ memory::ptr network::get_output_memory(const primitive_id& output_id) {
     return get_primitive(output_id)->output_memory_ptr();
 }

+layout network::get_node_output_layout(const primitive_id& output_id) const {
+    auto res = std::find_if(_outputs.begin(), _outputs.end(), [&](const std::shared_ptr<primitive_inst>& v) {
+        return v->id() == output_id;
+    });
+    OPENVINO_ASSERT(res != _outputs.end(), "[GPU] Couldn't get output layout for ", output_id, ". Output with such name is not found in the outputs list");
+
+    return (*res)->get_node_output_layout();
+}
+
 void network::allocate_primitives() {
     std::vector<std::shared_ptr<program_node>> nodes_to_allocate{};
     auto& po = _program->get_processing_order();
@@ -887,6 +896,11 @@ std::shared_ptr<primitive_inst> network::get_primitive(const primitive_id& id) {
     return _primitives.at(id);
 }

+std::shared_ptr<const primitive_inst> network::get_primitive(const primitive_id& id) const {
+    OPENVINO_ASSERT(_primitives.count(id) == 1, "[GPU] Can't get primitive with ", id, " id: primitive with such name hasn't been found in processing order");
+    return _primitives.at(id);
+}
+
 std::vector<std::shared_ptr<primitive_inst>> network::get_primitives(const std::vector<primitive_id>& ids) {
     std::vector<std::shared_ptr<primitive_inst>> result(ids.size());
     std::transform(std::begin(ids), std::end(ids), std::begin(result), [&](const primitive_id& id) {
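The two accessors above separate the layout the graph promised at build time from the layout of the currently allocated output memory; for a dynamic model only the former exists before the first inference. A minimal illustrative sketch of a caller (hypothetical code, not part of this patch):

    #include <intel_gpu/graph/network.hpp>  // header path as in the diff above

    // Hypothetical caller, illustrating only the API added by this patch.
    cldnn::layout describe_output(const cldnn::network& net, const cldnn::primitive_id& id) {
        // Usable even before the first inference: returns the layout recorded at
        // program build time, which may still contain dynamic dimensions, whereas
        // get_output_memory(id)->get_layout() requires an already-allocated buffer.
        return net.get_node_output_layout(id);
    }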
diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
index bb62ebe24f0..436fcc13d13 100644
--- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
+++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -438,6 +438,7 @@ void primitive_inst::build_deps() {
 primitive_inst::primitive_inst(network& network, program_node const& node, bool allocate_memory)
     : _network(network)
     , _node(node)
+    , _node_output_layout(node.get_output_layout())
     , _impl_params(node.get_kernel_impl_params())
     , _impl(node.get_selected_impl() ? node.get_selected_impl()->clone() : nullptr)
     , _outputs({memory::ptr()})
diff --git a/src/plugins/intel_gpu/src/plugin/graph.cpp b/src/plugins/intel_gpu/src/plugin/graph.cpp
index 025a8e453ad..2275ed51525 100644
--- a/src/plugins/intel_gpu/src/plugin/graph.cpp
+++ b/src/plugins/intel_gpu/src/plugin/graph.cpp
@@ -135,6 +135,7 @@ std::shared_ptr<cldnn::network> Graph::BuildNetwork(std::shared_ptr<cldnn::program> program)
     auto memStatesInfo = m_program->GetVariablesStatesInfo();
+    OPENVINO_ASSERT(memStatesInfo.empty() || !GetNetwork()->is_dynamic(), "[GPU] Dynamic shapes are not supported yet for stateful models");
     for (const auto& memStateInfo : memStatesInfo) {
         std::vector<cldnn::layout> orderedLayouts {memStateInfo.second.begin(), memStateInfo.second.end()};
         std::sort(orderedLayouts.begin(), orderedLayouts.end(), [](cldnn::layout& first, cldnn::layout& second) {
diff --git a/src/plugins/intel_gpu/src/plugin/infer_request.cpp b/src/plugins/intel_gpu/src/plugin/infer_request.cpp
index 7799609f53b..e681ca70fc3 100644
--- a/src/plugins/intel_gpu/src/plugin/infer_request.cpp
+++ b/src/plugins/intel_gpu/src/plugin/infer_request.cpp
@@ -16,6 +16,7 @@
 #include "intel_gpu/runtime/debug_configuration.hpp"
 #include "openvino/core/preprocess/input_tensor_info.hpp"
 #include
+#include "ie_ngraph_utils.hpp"
 #include

 using namespace InferenceEngine;
@@ -158,15 +159,9 @@ Blob::Ptr InferRequest::GetBlob(const std::string& name) {
     bool isDynamic = (node && node->get_output_partial_shape(0).is_dynamic());

     if (is_input) {
-        // ROI blob is returned only if it was set previously. Otherwise default blob is returned.
-        auto it = _preProcData.find(name);
-        if (it != _preProcData.end()) {
-            data = it->second->getRoiBlob();
-        } else {
-            data = _inputs[name];
-            if (!isDynamic)
-                checkInputBlob(data, name, foundInput);
-        }
+        data = _inputs[name];
+        if (!isDynamic)
+            checkInputBlob(data, name, foundInput);
     } else {
         data = _outputs[name];
         if (!isDynamic) {
@@ -390,11 +385,9 @@ void InferRequest::SetGraph(std::shared_ptr<Graph> graph) {
         IE_THROW(NetworkNotLoaded);
     }

-    if (!m_graph->GetNetwork()->is_dynamic()) {
-        allocate_inputs();
-        allocate_outputs();
-        variables_states_ = m_graph->AllocateVariablesMemories();
-    }
+    allocate_inputs();
+    allocate_outputs();
+    variables_states_ = m_graph->AllocateVariablesMemories();
 }

 InferRequest::InferRequest(InputsDataMap networkInputs, OutputsDataMap networkOutputs,
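SetGraph() above now always runs the allocations, and the allocate_inputs()/allocate_outputs() hunks further below make only the device-side blob conditional on a static layout. A condensed sketch of that split, with placeholder types and helpers (not plugin code):

    #include <memory>

    // Placeholders condensing the control flow of the allocation hunks below;
    // the real code builds InferenceEngine blobs, not these stand-ins.
    struct Blob { bool on_device = false; };

    static std::shared_ptr<Blob> alloc_host_blob()   { return std::make_shared<Blob>(); }
    static std::shared_ptr<Blob> alloc_device_blob() { return std::make_shared<Blob>(Blob{true}); }

    static void allocate_io(bool layout_is_static,
                            std::shared_ptr<Blob>& host,
                            std::shared_ptr<Blob>& device) {
        host = alloc_host_blob();          // always created; resized later if needed
        if (layout_is_static)
            device = alloc_device_blob();  // dynamic case: deferred to prepare_input()
                                           // / prepare_output() once the shape is known
    }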
@@ -527,11 +520,13 @@ void InferRequest::wait() {
         std::string outputID = outputsMap.empty() ? m_graph->MapOutputName(no.first) : outputsMap.at(no.first);

         auto outputMemory = internal_outputs.at(outputID).get_memory();
-        if (_outputs.find(no.first) == _outputs.end()) {
+        bool need_output_update = _outputs.find(no.first) == _outputs.end() || _outputs.at(no.first)->byteSize() != outputMemory->size();
+
+        if (need_output_update) {
             auto node = findOutputByNodeName(no.first);
             auto out_partial_shape = node->get_output_partial_shape(0);
-            size_t out_rank = out_partial_shape.rank().get_length();
             auto mem_dims = outputMemory->get_layout().get_shape();
+            size_t out_rank = out_partial_shape.size();
             auto precision = InferenceEngine::Precision::FP32;
             auto dims = SizeVector(mem_dims.begin(), mem_dims.end());
             if (static_cast(out_rank) < static_cast(dims.size())) {
@@ -554,7 +549,11 @@ void InferRequest::wait() {
             };
             auto layout = layout_by_rank(out_rank);
             auto tensorDesc = InferenceEngine::TensorDesc(precision, dims, layout);
-            _outputs[no.first] = create_host_blob(tensorDesc);
+            if (_outputs.find(no.first) == _outputs.end()) {
+                _outputs[no.first] = create_host_blob(tensorDesc);
+            } else {
+                _outputs[no.first]->setShape(dims);
+            }
         }

         Blob::Ptr bptr = _outputs[no.first];
@@ -593,9 +592,13 @@ void InferRequest::setup_stream_graph() {
     m_graph = streamGraphs[streamID];
 }

-Blob::Ptr InferRequest::create_host_blob(const TensorDesc& desc, std::shared_ptr<InferenceEngine::IAllocator> alloc) {
+Blob::Ptr InferRequest::create_host_blob(const TensorDesc& desc) {
     OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::create_host_blob");
-    auto blob = make_blob_with_precision(desc, alloc ? alloc : CreateDefaultAllocator());
+    // Disable USM usage as USMHostAllocator may fail for attempt to allocate 0 bytes
+    // If we add WA for such case to avoid driver call, then deallocate method will return false and Blob::setShape call will throw an exception
+    bool use_usm = m_graph->GetEngine()->use_unified_shared_memory() && !m_graph->GetNetwork()->is_dynamic();
+    auto alloc = use_usm ? std::make_shared<USMHostAllocator>(m_graph->GetContext().get()) : CreateDefaultAllocator();
+    auto blob = make_blob_with_precision(desc, alloc);
     blob->allocate();
     return blob;
 }
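This create_host_blob() change is the "WA for USM memory allocations with 0 bytes" from the commit message: a dynamic network can legitimately start with a 0-element descriptor, and a 0-byte USM allocation may fail in the driver. The allocator choice above reduces to the following sketch (enum and function names are illustrative, not plugin API):

    // Illustrative only: condenses the use_usm decision made above.
    enum class AllocatorKind { UsmHost, Default };

    AllocatorKind pick_allocator(bool engine_supports_usm, bool network_is_dynamic) {
        // Dynamic networks may request 0-byte host blobs before the first real
        // shape is known; a 0-byte USM allocation can fail in the driver, and
        // papering over that would make a later Blob::setShape() throw when
        // deallocate() reports failure. So USM host memory is only used when
        // every shape is static.
        return (engine_supports_usm && !network_is_dynamic) ? AllocatorKind::UsmHost
                                                            : AllocatorKind::Default;
    }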
@@ -715,6 +718,8 @@ void InferRequest::allocate_inputs() {
             IE_THROW() << "Input layout for " << name << " is not found";
         }

+        auto input_layout = litr->second;
+
         GPU_DEBUG_GET_INSTANCE(debug_config);
         GPU_DEBUG_IF(debug_config->verbose >= 2) {
             GPU_DEBUG_COUT << "[" << name << ": input blob]" << std::endl;
         }

         if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16) {
             TensorDesc desc_fp32 = desc;
             desc_fp32.setPrecision(Precision::FP32);
-            auto blobPtr = create_device_blob(desc_fp32);
-            _deviceInputs[name] = blobPtr;
-            Blob::Ptr inputBlob = create_host_blob(desc);
-            _inputs[name] = inputBlob;
+            _inputs[name] = create_host_blob(desc);
+            if (input_layout.is_static())
+                _deviceInputs[name] = create_device_blob(desc_fp32);
         } else {
-            if (m_graph->GetEngine()->use_unified_shared_memory()) {
-                // For USM case we create host blob using custom USM host allocator
-                // and then create shared device blob on top of this buffer
-                if (_inputs.find(name) == _inputs.end()) {
-                    auto host_blob = create_host_blob(desc, std::make_shared<USMHostAllocator>(m_graph->GetContext().get()));
-                    _inputs[name] = host_blob;
-                    _deviceInputs[name] = create_shared_device_blob(desc, litr->second, host_blob->buffer().as<void*>());
+            _inputs[name] = create_host_blob(desc);
+            if (input_layout.is_static()) {
+                if (m_graph->GetEngine()->use_unified_shared_memory()) {
+                    // For USM case we create host blob using custom USM host allocator
+                    // and then create shared device blob on top of this buffer
+                    auto host_blob = _inputs[name];
+                    _deviceInputs[name] = create_shared_device_blob(desc, input_layout, host_blob->buffer().as<void*>());
                 } else {
                     _deviceInputs[name] = create_device_blob(desc);
                 }
-            } else {
-                _inputs[name] = create_host_blob(desc);
-                _deviceInputs[name] = create_device_blob(desc);
             }
         }
     }
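In allocate_inputs() above, I16/U16 inputs keep the user precision on the host while the device blob is created as FP32 (desc_fp32), so values are widened during the host-to-device copy. A standalone sketch of that widening (the helper name is an assumption):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Assumed helper name; sketches the I16 -> FP32 widening implied by the
    // desc_fp32 device blob above.
    std::vector<float> widen_to_fp32(const std::vector<int16_t>& host_data) {
        std::vector<float> device_data(host_data.size());
        for (std::size_t i = 0; i < host_data.size(); ++i)
            device_data[i] = static_cast<float>(host_data[i]);
        return device_data;
    }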
@@ -748,15 +749,17 @@ void InferRequest::allocate_outputs() {
     OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::allocate_outputs");

+    // allocate outputs
     for (auto& no : _networkOutputs) {
         std::string outputID = m_graph->MapOutputName(no.first);
-        const cldnn::layout output_layout = m_graph->GetNetwork()->get_output_memory(outputID)->get_layout();
+        const cldnn::layout output_layout = m_graph->GetNetwork()->get_node_output_layout(outputID);
         TensorDesc desc = no.second->getTensorDesc();
         // Due to some reason TensorDesc in InferRequest contains wrong dims
         // while ExecutableNetwork contains proper ones. Thus replace dims with once from exec network
         // Can be removed once 76176 is resolved.
-        desc.setDims(m_graph->GetOutputSize(no.first));
+        if (output_layout.is_static())
+            desc.setDims(m_graph->GetOutputSize(no.first));

         GPU_DEBUG_GET_INSTANCE(debug_config);
         GPU_DEBUG_IF(debug_config->verbose >= 2) {
             GPU_DEBUG_COUT << "[" << no.first << ": output blob]" << std::endl;
         }
@@ -774,20 +777,20 @@ void InferRequest::allocate_outputs() {
             else
                 device_blob_desc.setPrecision(Precision::FP32);

-            auto host_blob = create_host_blob(desc);
-            _outputs[no.first] = host_blob;
-            auto device_blob = create_device_blob(device_blob_desc);
-            _deviceOutputs[no.first] = device_blob;
+            _outputs[no.first] = create_host_blob(desc);
+            if (output_layout.is_static())
+                _deviceOutputs[no.first] = create_device_blob(device_blob_desc);
         } else {
-            if (m_graph->GetEngine()->use_unified_shared_memory()) {
-                // For USM case we create host blob using custom USM host allocator
-                // and then create shared device blob on top of this buffer
-                auto host_blob = create_host_blob(desc, std::make_shared<USMHostAllocator>(m_graph->GetContext().get()));
-                _outputs[no.first] = host_blob;
-                _deviceOutputs[no.first] = create_shared_device_blob(desc, output_layout, host_blob->buffer().as<void*>());
-            } else {
-                _outputs[no.first] = create_host_blob(desc);
-                _deviceOutputs[no.first] = create_device_blob(desc);
+            _outputs[no.first] = create_host_blob(desc);
+            if (output_layout.is_static()) {
+                if (m_graph->GetEngine()->use_unified_shared_memory()) {
+                    // For USM case we create host blob using custom USM host allocator
+                    // and then create shared device blob on top of this buffer
+                    auto host_blob = _outputs[no.first];
+                    _deviceOutputs[no.first] = create_shared_device_blob(desc, output_layout, host_blob->buffer().as<void*>());
+                } else {
+                    _deviceOutputs[no.first] = create_device_blob(desc);
+                }
             }
         }
     }
@@ -817,6 +820,24 @@ void InferRequest::prepare_input(const cldnn::primitive_id& inputName, Blob::Ptr
     if (inputLayoutItr == m_graph->GetInputLayouts().end()) {
         IE_THROW() << "Input name mismatch.";
     }
+    auto input_layout = inputLayoutItr->second;
+    if (input_layout.is_dynamic()) {
+        bool has_device_blob = _deviceInputs.find(inputName) != _deviceInputs.end();
+        bool should_allocate_device_blob = !has_device_blob;
+        if (has_device_blob) {
+            auto device_blob = _deviceInputs.at(inputName);
+            if (device_blob->byteSize() < inputBlob->byteSize()) {
+                should_allocate_device_blob = true;
+            }
+        }
+
+        if (should_allocate_device_blob) {
+            _deviceInputs[inputName] = create_device_blob(inputBlob->getTensorDesc());
+        } else {
+            _deviceInputs[inputName] = reinterpret_device_blob(_deviceInputs[inputName], inputBlob->getTensorDesc());
+        }
+    }
+    OPENVINO_ASSERT(_deviceInputs.find(inputName) != _deviceInputs.end(), "[GPU] Couldn't find device blob allocated for ", inputName, " input");
     auto reqBlob = _deviceInputs.at(inputName)->as<gpu::ClBlob>();
     auto _nw_ptr = m_graph->GetNetwork();
     cldnn::primitive_id internalName = "parameter:" + inputName;
@@ -848,7 +869,7 @@ void InferRequest::prepare_input(const cldnn::primitive_id& inputName, Blob::Ptr
     auto input_layout = m_graph->GetInputLayouts().find(inputName);
     if (input_layout != m_graph->GetInputLayouts().end()) {
-        if (input_layout->second.format != inputMem->get_layout().format) {
+        if (input_layout->second.format != inputMem->get_layout().format && input_layout->second.is_static()) {
             inputMem = m_graph->GetNetwork()->get_engine().reinterpret_buffer(*inputMem, input_layout->second);
         }
     }
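For dynamic inputs, prepare_input() above now follows a grow-or-reuse policy: the device blob is reallocated only when the incoming blob needs more bytes; a smaller request reuses the existing buffer under a reinterpreted descriptor. The decision itself reduces to this standalone sketch:

    #include <cstddef>

    // Mirrors the should_allocate_device_blob logic in prepare_input() above.
    bool should_reallocate(bool has_device_blob,
                           std::size_t current_bytes,
                           std::size_t required_bytes) {
        if (!has_device_blob)
            return true;                        // first inference for this input
        return current_bytes < required_bytes;  // grow only; shrinking reuses the buffer
    }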
@@ -891,6 +912,9 @@ void InferRequest::prepare_input(const cldnn::primitive_id& inputName, Blob::Ptr

 void InferRequest::prepare_output(const cldnn::primitive_id& outputName, Blob::Ptr& outputBlob) {
     OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::prepare_output");
+    // Missing output in _deviceOutputs means that the network is dynamic and outputs couldn't be pre-allocated
+    if (_deviceOutputs.find(outputName) == _deviceOutputs.end())
+        return;
     Blob::Ptr reqBlob = _deviceOutputs.at(outputName);
     cldnn::primitive_id internalName = outputsMap[outputName];
     auto _nw_ptr = m_graph->GetNetwork();
@@ -921,7 +945,7 @@ InferenceEngine::Blob::Ptr InferRequest::create_device_blob(const InferenceEngine::TensorDesc& desc) {
             nullptr,
             0,
             0,
-            RemoteBlobImpl::BlobType::BT_USM_HOST_INTERNAL);
+            RemoteBlobImpl::BlobType::BT_USM_DEVICE_INTERNAL);
         getBlobImpl(blobPtr.get())->allocate();
         return blobPtr;
     } else {
diff --git a/src/plugins/intel_gpu/src/plugin/ops/parameter.cpp b/src/plugins/intel_gpu/src/plugin/ops/parameter.cpp
index e7615a66c46..9c6591e74dc 100644
--- a/src/plugins/intel_gpu/src/plugin/ops/parameter.cpp
+++ b/src/plugins/intel_gpu/src/plugin/ops/parameter.cpp
@@ -28,12 +28,14 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::Parameter>& op) {
     auto inputInfo = networkInputs.at(op->get_friendly_name());

     // first create and add the input layout
     const auto inputDesc = inputInfo->getTensorDesc();
-    auto inputDims = op->get_partial_shape();
+    auto input_pshape = op->get_partial_shape();
     InferenceEngine::Layout l = inputDesc.getLayout();
     InferenceEngine::Precision ip = inputDesc.getPrecision();

     cldnn::format inputFormat = cldnn::format::bfyx;
-    if (InferenceEngine::Layout::BLOCKED == l && 6 == inputDims.size()) {
+    if (input_pshape.is_dynamic()) {
+        inputFormat = cldnn::format::get_default_format(input_pshape.size());
+    } else if (InferenceEngine::Layout::BLOCKED == l && 6 == input_pshape.size()) {
         inputFormat = cldnn::format::bfwzyx;
     } else {
         inputFormat = FormatFromLayout(l);
@@ -43,7 +45,7 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::Parameter>& op) {
     auto preProcess = inputInfo->getPreProcess();
     size_t meanChannels = preProcess.getNumberOfChannels();
-    cldnn::layout networkInputLayout(inputDims,
+    cldnn::layout networkInputLayout(input_pshape,
                                      cldnn::element_type_to_data_type(op->get_output_element_type(0)),
                                      inputFormat);
     cldnn::primitive_id meanBlobID = inputName + Program::m_meanValuesTag;
@@ -70,7 +72,7 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::Parameter>& op) {
         if (batch > 1) {
-            networkInputLayout.set_partial_shape({ 1, inputDims[3], inputDims[1], inputDims[2] });
+            networkInputLayout.set_partial_shape({ 1, input_pshape[3], input_pshape[1], input_pshape[2] });

             std::vector<cldnn::primitive_id> inputs;
-            for (int64_t i = 0; i < inputDims[0].get_length(); ++i) {
+            for (int64_t i = 0; i < input_pshape[0].get_length(); ++i) {
                 std::string batched_name = inputName + "_" + std::to_string(i);
                 p.inputLayouts.insert({ inputInfo->name() + "_" + std::to_string(i), networkInputLayout });
                 inputs.emplace_back(batched_name);
@@ -166,7 +168,7 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::Parameter>& op) {
         p.inputLayouts.insert({ inputInfo->name(), networkInputLayout });
         p.add_primitive(*op, cldnn::input_layout(inputName, networkInputLayout));
@@ -180,9 +182,9 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::Parameter>& op) {
             inputName = inputInfo->name();
         }
-        int height = inputDims[2].get_length();
-        int width = inputDims[3].get_length();
-        size_t batch = inputDims[0].get_length();
+        int height = input_pshape[2].get_length();
+        int width = input_pshape[3].get_length();
+        size_t batch = input_pshape[0].get_length();

         std::vector<cldnn::primitive_id> reorders;
         for (size_t i = 0; i < batch; i++) {
@@ -228,7 +230,7 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::Parameter>& op) {
-        if (inputDims[0].get_length() > 1) {
+        if (input_pshape[0].get_length() > 1) {
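The parameter.cpp hunks above handle dynamic inputs by picking the default cldnn format for the rank instead of deriving it from the IE layout. An assumed sketch of that rank-to-format mapping (the real entry point is cldnn::format::get_default_format, called above):

    #include <cstddef>

    // Assumed behavior of cldnn::format::get_default_format for common ranks;
    // returns the bfyx family used throughout the hunks above.
    const char* default_format_for_rank(std::size_t rank) {
        switch (rank) {
            case 1:
            case 2:
            case 3:
            case 4:  return "bfyx";    // ranks below 4 are padded up to 4D
            case 5:  return "bfzyx";
            case 6:  return "bfwzyx";
            default: return "unsupported";
        }
    }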
            auto concatPrimID = "concat:" + inputName + Program::m_preProcessTag;
             p.add_primitive(*op, cldnn::concatenation(concatPrimID, reorders, 0));
         }
diff --git a/src/plugins/intel_gpu/src/plugin/ops/result.cpp b/src/plugins/intel_gpu/src/plugin/ops/result.cpp
index 01cb830b712..b5ba3664a23 100644
--- a/src/plugins/intel_gpu/src/plugin/ops/result.cpp
+++ b/src/plugins/intel_gpu/src/plugin/ops/result.cpp
@@ -66,9 +66,7 @@ static void CreateResultOp(Program& p, const std::shared_ptr<ngraph::op::v0::Result>& op) {
     auto reorder_primitive = cldnn::reorder(outLayerName,
                                             originalOutName,
                                             FormatFromLayout(outputDesc.getLayout()),
-                                            DataTypeFromPrecision(precision),
-                                            std::vector<float>(),
-                                            cldnn::reorder_mean_mode::subtract);
+                                            DataTypeFromPrecision(precision));
     p.add_primitive(*op, reorder_primitive, {originalOutName});
     p.outputDims[originalOutName] = outputDesc.getDims();
     p.prevPrimitiveIDs[outLayerName] = {originalOutName};
diff --git a/src/plugins/intel_gpu/src/plugin/ops/rnn.cpp b/src/plugins/intel_gpu/src/plugin/ops/rnn.cpp
index 08cf9a8709a..602741aff07 100644
--- a/src/plugins/intel_gpu/src/plugin/ops/rnn.cpp
+++ b/src/plugins/intel_gpu/src/plugin/ops/rnn.cpp
@@ -261,8 +261,7 @@ static void CreateLSTMSequenceOp(Program& p, const std::shared_ptr