[GPU] Fixes for infer request impl in dynamic cases (#13050)
* [GPU] Fixes for infer request impl in dynamic cases * [GPU] Fixed inconsistent output shapes for LSTMSequence op * [GPU] Update network::get_output_layout method * [GPU] WA for USM memory allocations with 0 bytes
This commit is contained in:
committed by
GitHub
parent
dc1fe22fed
commit
ef12d3976f
@@ -121,6 +121,7 @@ public:
|
||||
}
|
||||
|
||||
memory::ptr get_output_memory(const primitive_id& output_id);
|
||||
layout get_node_output_layout(const primitive_id& output_id) const;
|
||||
|
||||
/// @brief Returns the list of primitive ids before and after graph optimization.
|
||||
/// @details If primitive was not optimized, the old and actual id will be the same.
|
||||
@@ -178,6 +179,7 @@ public:
|
||||
void set_arguments();
|
||||
// Implementation specific calls
|
||||
std::shared_ptr<primitive_inst> get_primitive(const primitive_id& id);
|
||||
std::shared_ptr<const primitive_inst> get_primitive(const primitive_id& id) const;
|
||||
std::string get_primitive_info(const primitive_id& id) const;
|
||||
std::string get_implementation_info(const primitive_id& id) const;
|
||||
const event::ptr& get_primitive_event(const primitive_id& id) const { return _events.at(id); }
|
||||
|
||||
@@ -74,8 +74,7 @@ private:
|
||||
std::vector<cldnn::event::ptr>& dependencies);
|
||||
void prepare_output(const cldnn::primitive_id& outputName, InferenceEngine::Blob::Ptr& outputBlob);
|
||||
|
||||
InferenceEngine::Blob::Ptr create_host_blob(const InferenceEngine::TensorDesc& desc,
|
||||
std::shared_ptr<InferenceEngine::IAllocator> alloc = nullptr);
|
||||
InferenceEngine::Blob::Ptr create_host_blob(const InferenceEngine::TensorDesc& desc);
|
||||
InferenceEngine::Blob::Ptr create_device_blob(const InferenceEngine::TensorDesc& desc);
|
||||
|
||||
void copy_output_data(cldnn::memory::ptr outputMemory, InferenceEngine::Blob::Ptr bptr);
|
||||
|
||||
@@ -195,11 +195,14 @@ public:
|
||||
const std::unordered_map<size_t, std::tuple<int64_t, size_t>>& get_profiling_data() const { return _profiling_data; }
|
||||
const std::unordered_map<size_t, instrumentation::perf_counter_key>& get_profiling_info() const { return _profiling_info; }
|
||||
|
||||
layout get_node_output_layout() const { return _node_output_layout; }
|
||||
|
||||
protected:
|
||||
primitive_inst(network& network, program_node const& node, bool allocate_memory);
|
||||
|
||||
network& _network;
|
||||
program_node const& _node;
|
||||
const layout _node_output_layout;
|
||||
|
||||
std::unique_ptr<kernel_impl_params> _impl_params;
|
||||
std::unique_ptr<primitive_impl> _impl;
|
||||
|
||||
@@ -587,6 +587,15 @@ memory::ptr network::get_output_memory(const primitive_id& output_id) {
|
||||
return get_primitive(output_id)->output_memory_ptr();
|
||||
}
|
||||
|
||||
/// @brief Returns the node output layout stored on the output primitive instance with the given id.
/// @details Only primitives present in the _outputs list are considered. If no entry has an id()
/// equal to @p output_id, OPENVINO_ASSERT fails with a message naming the requested id.
/// @param output_id Primitive id of the requested network output.
/// @return The value of get_node_output_layout() of the matching primitive instance.
layout network::get_node_output_layout(const primitive_id& output_id) const {
|
||||
// Linear search over the network outputs, matching on primitive id.
auto res = std::find_if(_outputs.begin(), _outputs.end(), [&](const std::shared_ptr<primitive_inst>& v) {
|
||||
return v->id() == output_id;
|
||||
});
|
||||
// Fail loudly instead of dereferencing the end iterator when the id is not an output.
OPENVINO_ASSERT(res != _outputs.end(), "[GPU] Couldn't get output layout for ", output_id, ". Output with such name is not found in the outputs list");
|
||||
|
||||
return (*res)->get_node_output_layout();
|
||||
}
|
||||
|
||||
void network::allocate_primitives() {
|
||||
std::vector<std::shared_ptr<program_node>> nodes_to_allocate{};
|
||||
auto& po = _program->get_processing_order();
|
||||
@@ -887,6 +896,11 @@ std::shared_ptr<primitive_inst> network::get_primitive(const primitive_id& id) {
|
||||
return _primitives.at(id);
|
||||
}
|
||||
|
||||
/// @brief Const overload: looks up a primitive instance by id in _primitives.
/// @details Asserts via OPENVINO_ASSERT that exactly one entry with this id exists before
/// accessing the container, so a missing id is reported with a message that names the id.
/// @param id Primitive id to look up.
/// @return Shared pointer to the (const) primitive instance stored under @p id.
std::shared_ptr<const primitive_inst> network::get_primitive(const primitive_id& id) const {
|
||||
OPENVINO_ASSERT(_primitives.count(id) == 1, "[GPU] Can't get primitive with ", id, " id: primitive with such name hasn't been found in processing order");
|
||||
return _primitives.at(id);
|
||||
}
|
||||
|
||||
std::vector<std::shared_ptr<primitive_inst>> network::get_primitives(const std::vector<primitive_id>& ids) {
|
||||
std::vector<std::shared_ptr<primitive_inst>> result(ids.size());
|
||||
std::transform(std::begin(ids), std::end(ids), std::begin(result), [&](const primitive_id& id) {
|
||||
|
||||
@@ -438,6 +438,7 @@ void primitive_inst::build_deps() {
|
||||
primitive_inst::primitive_inst(network& network, program_node const& node, bool allocate_memory)
|
||||
: _network(network)
|
||||
, _node(node)
|
||||
, _node_output_layout(node.get_output_layout())
|
||||
, _impl_params(node.get_kernel_impl_params())
|
||||
, _impl(node.get_selected_impl() ? node.get_selected_impl()->clone() : nullptr)
|
||||
, _outputs({memory::ptr()})
|
||||
|
||||
@@ -135,6 +135,7 @@ std::shared_ptr<cldnn::network> Graph::BuildNetwork(std::shared_ptr<cldnn::progr
|
||||
Graph::variable_states_map Graph::AllocateVariablesMemories() {
|
||||
Graph::variable_states_map states {};
|
||||
const auto& memStatesInfo = m_program->GetVariablesStatesInfo();
|
||||
OPENVINO_ASSERT(memStatesInfo.empty() || !GetNetwork()->is_dynamic(), "[GPU] Dynamic shapes are not supported yet for stateful models");
|
||||
for (const auto& memStateInfo : memStatesInfo) {
|
||||
std::vector<cldnn::layout> orderedLayouts {memStateInfo.second.begin(), memStateInfo.second.end()};
|
||||
std::sort(orderedLayouts.begin(), orderedLayouts.end(), [](cldnn::layout& first, cldnn::layout& second) {
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "intel_gpu/runtime/debug_configuration.hpp"
|
||||
#include "openvino/core/preprocess/input_tensor_info.hpp"
|
||||
#include <ie_algorithm.hpp>
|
||||
#include "ie_ngraph_utils.hpp"
|
||||
#include <debug.h>
|
||||
|
||||
using namespace InferenceEngine;
|
||||
@@ -158,15 +159,9 @@ Blob::Ptr InferRequest::GetBlob(const std::string& name) {
|
||||
bool isDynamic = (node && node->get_output_partial_shape(0).is_dynamic());
|
||||
|
||||
if (is_input) {
|
||||
// ROI blob is returned only if it was set previously. Otherwise default blob is returned.
|
||||
auto it = _preProcData.find(name);
|
||||
if (it != _preProcData.end()) {
|
||||
data = it->second->getRoiBlob();
|
||||
} else {
|
||||
data = _inputs[name];
|
||||
if (!isDynamic)
|
||||
checkInputBlob(data, name, foundInput);
|
||||
}
|
||||
data = _inputs[name];
|
||||
if (!isDynamic)
|
||||
checkInputBlob(data, name, foundInput);
|
||||
} else {
|
||||
data = _outputs[name];
|
||||
if (!isDynamic) {
|
||||
@@ -390,11 +385,9 @@ void InferRequest::SetGraph(std::shared_ptr<Graph> graph) {
|
||||
IE_THROW(NetworkNotLoaded);
|
||||
}
|
||||
|
||||
if (!m_graph->GetNetwork()->is_dynamic()) {
|
||||
allocate_inputs();
|
||||
allocate_outputs();
|
||||
variables_states_ = m_graph->AllocateVariablesMemories();
|
||||
}
|
||||
allocate_inputs();
|
||||
allocate_outputs();
|
||||
variables_states_ = m_graph->AllocateVariablesMemories();
|
||||
}
|
||||
|
||||
InferRequest::InferRequest(InputsDataMap networkInputs, OutputsDataMap networkOutputs,
|
||||
@@ -527,11 +520,13 @@ void InferRequest::wait() {
|
||||
std::string outputID = outputsMap.empty() ? m_graph->MapOutputName(no.first) : outputsMap.at(no.first);
|
||||
auto outputMemory = internal_outputs.at(outputID).get_memory();
|
||||
|
||||
if (_outputs.find(no.first) == _outputs.end()) {
|
||||
bool need_output_update = _outputs.find(no.first) == _outputs.end() || _outputs.at(no.first)->byteSize() != outputMemory->size();
|
||||
|
||||
if (need_output_update) {
|
||||
auto node = findOutputByNodeName(no.first);
|
||||
auto out_partial_shape = node->get_output_partial_shape(0);
|
||||
size_t out_rank = out_partial_shape.rank().get_length();
|
||||
auto mem_dims = outputMemory->get_layout().get_shape();
|
||||
size_t out_rank = out_partial_shape.size();
|
||||
auto precision = InferenceEngine::Precision::FP32;
|
||||
auto dims = SizeVector(mem_dims.begin(), mem_dims.end());
|
||||
if (static_cast<int32_t>(out_rank) < static_cast<int32_t>(dims.size())) {
|
||||
@@ -554,7 +549,11 @@ void InferRequest::wait() {
|
||||
};
|
||||
auto layout = layout_by_rank(out_rank);
|
||||
auto tensorDesc = InferenceEngine::TensorDesc(precision, dims, layout);
|
||||
_outputs[no.first] = create_host_blob(tensorDesc);
|
||||
if (_outputs.find(no.first) == _outputs.end()) {
|
||||
_outputs[no.first] = create_host_blob(tensorDesc);
|
||||
} else {
|
||||
_outputs[no.first]->setShape(dims);
|
||||
}
|
||||
}
|
||||
Blob::Ptr bptr = _outputs[no.first];
|
||||
|
||||
@@ -593,9 +592,13 @@ void InferRequest::setup_stream_graph() {
|
||||
m_graph = streamGraphs[streamID];
|
||||
}
|
||||
|
||||
Blob::Ptr InferRequest::create_host_blob(const TensorDesc& desc, std::shared_ptr<InferenceEngine::IAllocator> alloc) {
|
||||
Blob::Ptr InferRequest::create_host_blob(const TensorDesc& desc) {
|
||||
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::create_host_blob");
|
||||
auto blob = make_blob_with_precision(desc, alloc ? alloc : CreateDefaultAllocator());
|
||||
// Disable USM usage as USMHostAllocator may fail for attempt to allocate 0 bytes
|
||||
// If we add WA for such case to avoid driver call, then deallocate method will return false and Blob::setShape call will throw an exception
|
||||
bool use_usm = m_graph->GetEngine()->use_unified_shared_memory() && !m_graph->GetNetwork()->is_dynamic();
|
||||
auto alloc = use_usm ? std::make_shared<USMHostAllocator>(m_graph->GetContext().get()) : CreateDefaultAllocator();
|
||||
auto blob = make_blob_with_precision(desc, alloc);
|
||||
blob->allocate();
|
||||
return blob;
|
||||
}
|
||||
@@ -715,6 +718,8 @@ void InferRequest::allocate_inputs() {
|
||||
IE_THROW() << "Input layout for " << name << " is not found";
|
||||
}
|
||||
|
||||
auto input_layout = litr->second;
|
||||
|
||||
GPU_DEBUG_GET_INSTANCE(debug_config);
|
||||
GPU_DEBUG_IF(debug_config->verbose >= 2) {
|
||||
GPU_DEBUG_COUT << "[" << name << ": input blob]" << std::endl;
|
||||
@@ -722,24 +727,20 @@ void InferRequest::allocate_inputs() {
|
||||
if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16) {
|
||||
TensorDesc desc_fp32 = desc;
|
||||
desc_fp32.setPrecision(Precision::FP32);
|
||||
auto blobPtr = create_device_blob(desc_fp32);
|
||||
_deviceInputs[name] = blobPtr;
|
||||
Blob::Ptr inputBlob = create_host_blob(desc);
|
||||
_inputs[name] = inputBlob;
|
||||
_inputs[name] = create_host_blob(desc);
|
||||
if (input_layout.is_static())
|
||||
_deviceInputs[name] = create_device_blob(desc_fp32);
|
||||
} else {
|
||||
if (m_graph->GetEngine()->use_unified_shared_memory()) {
|
||||
// For USM case we create host blob using custom USM host allocator
|
||||
// and then create shared device blob on top of this buffer
|
||||
if (_inputs.find(name) == _inputs.end()) {
|
||||
auto host_blob = create_host_blob(desc, std::make_shared<USMHostAllocator>(m_graph->GetContext().get()));
|
||||
_inputs[name] = host_blob;
|
||||
_deviceInputs[name] = create_shared_device_blob(desc, litr->second, host_blob->buffer().as<void*>());
|
||||
_inputs[name] = create_host_blob(desc);
|
||||
if (input_layout.is_static()) {
|
||||
if (m_graph->GetEngine()->use_unified_shared_memory()) {
|
||||
// For USM case we create host blob using custom USM host allocator
|
||||
// and then create shared device blob on top of this buffer
|
||||
auto host_blob = _inputs[name];
|
||||
_deviceInputs[name] = create_shared_device_blob(desc, input_layout, host_blob->buffer().as<void*>());
|
||||
} else {
|
||||
_deviceInputs[name] = create_device_blob(desc);
|
||||
}
|
||||
} else {
|
||||
_inputs[name] = create_host_blob(desc);
|
||||
_deviceInputs[name] = create_device_blob(desc);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -748,15 +749,17 @@ void InferRequest::allocate_inputs() {
|
||||
|
||||
void InferRequest::allocate_outputs() {
|
||||
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::allocate_outputs");
|
||||
|
||||
// allocate outputs
|
||||
for (auto& no : _networkOutputs) {
|
||||
std::string outputID = m_graph->MapOutputName(no.first);
|
||||
const cldnn::layout output_layout = m_graph->GetNetwork()->get_output_memory(outputID)->get_layout();
|
||||
const cldnn::layout output_layout = m_graph->GetNetwork()->get_node_output_layout(outputID);
|
||||
TensorDesc desc = no.second->getTensorDesc();
|
||||
// For some reason TensorDesc in InferRequest contains wrong dims
|
||||
// while ExecutableNetwork contains proper ones. Thus replace dims with ones from exec network
|
||||
// Can be removed once 76176 is resolved.
|
||||
desc.setDims(m_graph->GetOutputSize(no.first));
|
||||
if (output_layout.is_static())
|
||||
desc.setDims(m_graph->GetOutputSize(no.first));
|
||||
|
||||
GPU_DEBUG_GET_INSTANCE(debug_config);
|
||||
GPU_DEBUG_IF(debug_config->verbose >= 2) {
|
||||
@@ -774,20 +777,20 @@ void InferRequest::allocate_outputs() {
|
||||
else
|
||||
device_blob_desc.setPrecision(Precision::FP32);
|
||||
|
||||
auto host_blob = create_host_blob(desc);
|
||||
_outputs[no.first] = host_blob;
|
||||
auto device_blob = create_device_blob(device_blob_desc);
|
||||
_deviceOutputs[no.first] = device_blob;
|
||||
_outputs[no.first] = create_host_blob(desc);
|
||||
if (output_layout.is_static())
|
||||
_deviceOutputs[no.first] = create_device_blob(device_blob_desc);
|
||||
} else {
|
||||
if (m_graph->GetEngine()->use_unified_shared_memory()) {
|
||||
// For USM case we create host blob using custom USM host allocator
|
||||
// and then create shared device blob on top of this buffer
|
||||
auto host_blob = create_host_blob(desc, std::make_shared<USMHostAllocator>(m_graph->GetContext().get()));
|
||||
_outputs[no.first] = host_blob;
|
||||
_deviceOutputs[no.first] = create_shared_device_blob(desc, output_layout, host_blob->buffer().as<void*>());
|
||||
} else {
|
||||
_outputs[no.first] = create_host_blob(desc);
|
||||
_deviceOutputs[no.first] = create_device_blob(desc);
|
||||
_outputs[no.first] = create_host_blob(desc);
|
||||
if (output_layout.is_static()) {
|
||||
if (m_graph->GetEngine()->use_unified_shared_memory()) {
|
||||
// For USM case we create host blob using custom USM host allocator
|
||||
// and then create shared device blob on top of this buffer
|
||||
auto host_blob = _outputs[no.first];
|
||||
_deviceOutputs[no.first] = create_shared_device_blob(desc, output_layout, host_blob->buffer().as<void*>());
|
||||
} else {
|
||||
_deviceOutputs[no.first] = create_device_blob(desc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -817,6 +820,24 @@ void InferRequest::prepare_input(const cldnn::primitive_id& inputName, Blob::Ptr
|
||||
if (inputLayoutItr == m_graph->GetInputLayouts().end()) {
|
||||
IE_THROW() << "Input name mismatch.";
|
||||
}
|
||||
auto input_layout = inputLayoutItr->second;
|
||||
if (input_layout.is_dynamic()) {
|
||||
bool has_device_blob = _deviceInputs.find(inputName) != _deviceInputs.end();
|
||||
bool should_allocate_device_blob = !has_device_blob;
|
||||
if (has_device_blob) {
|
||||
auto device_blob = _deviceInputs.at(inputName);
|
||||
if (device_blob->byteSize() < inputBlob->byteSize()) {
|
||||
should_allocate_device_blob = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (should_allocate_device_blob) {
|
||||
_deviceInputs[inputName] = create_device_blob(inputBlob->getTensorDesc());
|
||||
} else {
|
||||
_deviceInputs[inputName] = reinterpret_device_blob(_deviceInputs[inputName], inputBlob->getTensorDesc());
|
||||
}
|
||||
}
|
||||
OPENVINO_ASSERT(_deviceInputs.find(inputName) != _deviceInputs.end(), "[GPU] Couldn't find device blob allocated for ", inputName, " input");
|
||||
auto reqBlob = _deviceInputs.at(inputName)->as<gpu::ClBlob>();
|
||||
auto _nw_ptr = m_graph->GetNetwork();
|
||||
cldnn::primitive_id internalName = "parameter:" + inputName;
|
||||
@@ -848,7 +869,7 @@ void InferRequest::prepare_input(const cldnn::primitive_id& inputName, Blob::Ptr
|
||||
|
||||
auto input_layout = m_graph->GetInputLayouts().find(inputName);
|
||||
if (input_layout != m_graph->GetInputLayouts().end()) {
|
||||
if (input_layout->second.format != inputMem->get_layout().format) {
|
||||
if (input_layout->second.format != inputMem->get_layout().format && input_layout->second.is_static()) {
|
||||
inputMem = m_graph->GetNetwork()->get_engine().reinterpret_buffer(*inputMem, input_layout->second);
|
||||
}
|
||||
}
|
||||
@@ -891,6 +912,9 @@ void InferRequest::prepare_input(const cldnn::primitive_id& inputName, Blob::Ptr
|
||||
|
||||
void InferRequest::prepare_output(const cldnn::primitive_id& outputName, Blob::Ptr& outputBlob) {
|
||||
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::prepare_output");
|
||||
// Missing output in _deviceOutputs means that the network is dynamic and outputs couldn't be pre-allocated
|
||||
if (_deviceOutputs.find(outputName) == _deviceOutputs.end())
|
||||
return;
|
||||
Blob::Ptr reqBlob = _deviceOutputs.at(outputName);
|
||||
cldnn::primitive_id internalName = outputsMap[outputName];
|
||||
auto _nw_ptr = m_graph->GetNetwork();
|
||||
@@ -921,7 +945,7 @@ InferenceEngine::Blob::Ptr InferRequest::create_device_blob(const InferenceEngin
|
||||
nullptr,
|
||||
0,
|
||||
0,
|
||||
RemoteBlobImpl::BlobType::BT_USM_HOST_INTERNAL);
|
||||
RemoteBlobImpl::BlobType::BT_USM_DEVICE_INTERNAL);
|
||||
getBlobImpl(blobPtr.get())->allocate();
|
||||
return blobPtr;
|
||||
} else {
|
||||
|
||||
@@ -28,12 +28,14 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::
|
||||
auto inputInfo = networkInputs.at(op->get_friendly_name());
|
||||
// first create and add the input layout
|
||||
const auto inputDesc = inputInfo->getTensorDesc();
|
||||
auto inputDims = op->get_partial_shape();
|
||||
auto input_pshape = op->get_partial_shape();
|
||||
InferenceEngine::Layout l = inputDesc.getLayout();
|
||||
InferenceEngine::Precision ip = inputDesc.getPrecision();
|
||||
|
||||
cldnn::format inputFormat = cldnn::format::bfyx;
|
||||
if (InferenceEngine::Layout::BLOCKED == l && 6 == inputDims.size()) {
|
||||
if (input_pshape.is_dynamic()) {
|
||||
inputFormat = cldnn::format::get_default_format(input_pshape.size());
|
||||
} else if (InferenceEngine::Layout::BLOCKED == l && 6 == input_pshape.size()) {
|
||||
inputFormat = cldnn::format::bfwzyx;
|
||||
} else {
|
||||
inputFormat = FormatFromLayout(l);
|
||||
@@ -43,7 +45,7 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::
|
||||
auto inputName = layer_type_name_ID(op);
|
||||
auto preProcess = inputInfo->getPreProcess();
|
||||
size_t meanChannels = preProcess.getNumberOfChannels();
|
||||
cldnn::layout networkInputLayout(inputDims,
|
||||
cldnn::layout networkInputLayout(input_pshape,
|
||||
cldnn::element_type_to_data_type(op->get_output_element_type(0)),
|
||||
inputFormat);
|
||||
cldnn::primitive_id meanBlobID = inputName + Program::m_meanValuesTag;
|
||||
@@ -70,7 +72,7 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::
|
||||
IE_ASSERT(meanChannels);
|
||||
// first merge all mean values to a single blob
|
||||
// todo make sure mean blob precision is the same as the input precision
|
||||
auto meanDims = inputDims;
|
||||
auto meanDims = input_pshape;
|
||||
// overwrite batches with 1
|
||||
switch (meanDims.size()) {
|
||||
case 4: meanDims[0] = 1;
|
||||
@@ -155,10 +157,10 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::
|
||||
}
|
||||
|
||||
if (networkInputLayout.format == cldnn::format::nv12 && networkInputLayout.get_tensor().batch[0] > 1) {
|
||||
networkInputLayout.set_partial_shape({ 1, inputDims[3], inputDims[1], inputDims[2] });
|
||||
networkInputLayout.set_partial_shape({ 1, input_pshape[3], input_pshape[1], input_pshape[2] });
|
||||
|
||||
std::vector<cldnn::primitive_id> inputs;
|
||||
for (int64_t i = 0; i < inputDims[0].get_length(); ++i) {
|
||||
for (int64_t i = 0; i < input_pshape[0].get_length(); ++i) {
|
||||
std::string batched_name = inputName + "_" + std::to_string(i);
|
||||
p.inputLayouts.insert({ inputInfo->name() + "_" + std::to_string(i), networkInputLayout });
|
||||
inputs.emplace_back(batched_name);
|
||||
@@ -166,7 +168,7 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::
|
||||
}
|
||||
p.primitive_ids[inputName] = inputName;
|
||||
} else {
|
||||
networkInputLayout.set_partial_shape({ inputDims[0], inputDims[3], inputDims[1], inputDims[2] });
|
||||
networkInputLayout.set_partial_shape({ input_pshape[0], input_pshape[3], input_pshape[1], input_pshape[2] });
|
||||
|
||||
p.inputLayouts.insert({ inputInfo->name(), networkInputLayout });
|
||||
p.add_primitive(*op, cldnn::input_layout(inputName, networkInputLayout));
|
||||
@@ -180,9 +182,9 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::
|
||||
IE_THROW() << "Unsupported layout (" << l << ") or precision "
|
||||
<< ip.name() << ") for NV12 input " + inputInfo->name();
|
||||
}
|
||||
int height = inputDims[2].get_length();
|
||||
int width = inputDims[3].get_length();
|
||||
size_t batch = inputDims[0].get_length();
|
||||
int height = input_pshape[2].get_length();
|
||||
int width = input_pshape[3].get_length();
|
||||
size_t batch = input_pshape[0].get_length();
|
||||
std::vector<cldnn::primitive_id> reorders;
|
||||
|
||||
for (size_t i = 0; i < batch; i++) {
|
||||
@@ -228,7 +230,7 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::
|
||||
reorders.push_back(preprocessPrimID);
|
||||
}
|
||||
|
||||
if (inputDims[0].get_length() > 1) {
|
||||
if (input_pshape[0].get_length() > 1) {
|
||||
auto concatPrimID = "concat:" + inputName + Program::m_preProcessTag;
|
||||
p.add_primitive(*op, cldnn::concatenation(concatPrimID, reorders, 0));
|
||||
}
|
||||
|
||||
@@ -66,9 +66,7 @@ static void CreateResultOp(Program& p, const std::shared_ptr<ngraph::op::v0::Res
|
||||
auto reorder_primitive = cldnn::reorder(outLayerName,
|
||||
outputID,
|
||||
FormatFromLayout(outputlayout),
|
||||
DataTypeFromPrecision(precision),
|
||||
std::vector<float>(),
|
||||
cldnn::reorder_mean_mode::subtract);
|
||||
DataTypeFromPrecision(precision));
|
||||
p.add_primitive(*op, reorder_primitive, {originalOutName});
|
||||
p.outputDims[originalOutName] = outputDesc.getDims();
|
||||
p.prevPrimitiveIDs[outLayerName] = {originalOutName};
|
||||
|
||||
@@ -261,8 +261,7 @@ static void CreateLSTMSequenceOp(Program& p, const std::shared_ptr<ngraph::op::v
|
||||
|
||||
hiddenStr = crop_id + ":hidden";
|
||||
cellStr = crop_id + ":cell";
|
||||
cldnn::primitive_id outputHiddenID = layerName + ".out1";
|
||||
p.add_primitive(*op, cldnn::crop(hiddenStr, lstm_elt_id, hiddenSz, cldnn::tensor{ 0, 0, 0, 0 }), {outputHiddenID});
|
||||
p.add_primitive(*op, cldnn::crop(hiddenStr, lstm_elt_id, hiddenSz, cldnn::tensor{ 0, 0, 0, 0 }));
|
||||
output_ids_offsets.push_back(hiddenStr);
|
||||
|
||||
if (i < lstm_sequence_len - 1) {
|
||||
@@ -271,16 +270,18 @@ static void CreateLSTMSequenceOp(Program& p, const std::shared_ptr<ngraph::op::v
|
||||
// last hidden state crop (output 2)
|
||||
|
||||
// last cell state crop (output 3)
|
||||
cldnn::primitive_id outputCellID = layerName + ".out2";
|
||||
p.add_primitive(*op, cldnn::crop(cellStr, lstm_elt_id, hiddenSz, cellCropSz), {outputCellID});
|
||||
p.add_primitive(*op, cldnn::crop(cellStr, lstm_elt_id, hiddenSz, cellCropSz));
|
||||
}
|
||||
}
|
||||
|
||||
if (!isForward) std::reverse(output_ids_offsets.begin(), output_ids_offsets.end());
|
||||
// concatenated hidden state (output 1)
|
||||
cldnn::primitive_id outputConcatID = layerName + ".out0";
|
||||
cldnn::primitive_id concatStr = layerName + ":hiddenConcat";
|
||||
p.add_primitive(*op, cldnn::concatenation(concatStr, output_ids_offsets, 1), {outputConcatID, layerName});
|
||||
p.add_primitive(*op, cldnn::concatenation(concatStr, output_ids_offsets, 1));
|
||||
|
||||
p.add_primitive(*op, cldnn::reshape(layerName + ".out0", concatStr, tensor_from_dims(op->get_output_shape(0))), {layerName});
|
||||
p.add_primitive(*op, cldnn::reshape(layerName + ".out1", hiddenStr, tensor_from_dims(op->get_output_shape(1))));
|
||||
p.add_primitive(*op, cldnn::reshape(layerName + ".out2", cellStr, tensor_from_dims(op->get_output_shape(2))));
|
||||
}
|
||||
|
||||
REGISTER_FACTORY_IMPL(v4, LSTMCell);
|
||||
|
||||
Reference in New Issue
Block a user