[GPU] Make ShapePredictor instance unique for each InferRequest instead of the cldnn::network (#21019)
commit 306137f86b (parent 51da30b48d)
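In short: ShapePredictor ownership moves out of cldnn::network (previously a std::unique_ptr member) and into each SyncInferRequest, which creates its own predictor and installs it on the shared network right before execution; loop and condition body networks inherit the outer network's predictor. Below is a minimal sketch of that ownership flow. The class names mirror the real ones, but the members and bodies are simplified stand-ins for illustration, not the actual OpenVINO implementation.

#include <memory>

// Simplified stand-ins; the real cldnn::ShapePredictor tracks per-buffer
// preallocation statistics used by realloc_if_needed().
struct ShapePredictor {};

struct network {
    // After this PR the network only borrows the predictor through a shared_ptr
    // (it used to own it via std::unique_ptr).
    std::shared_ptr<ShapePredictor> get_shape_predictor() { return _shape_predictor; }
    void set_shape_predictor(std::shared_ptr<ShapePredictor> sp) { _shape_predictor = std::move(sp); }

private:
    std::shared_ptr<ShapePredictor> _shape_predictor;
};

struct SyncInferRequest {
    SyncInferRequest() : m_shape_predictor(std::make_shared<ShapePredictor>()) {}

    void enqueue(network& net) {
        // Each request installs its own predictor before executing, so two
        // requests that share one compiled network no longer share predictor state.
        net.set_shape_predictor(m_shape_predictor);
        // net.execute(...);
    }

private:
    std::shared_ptr<ShapePredictor> m_shape_predictor;  // one predictor per infer request
};

int main() {
    network net;
    SyncInferRequest req_a, req_b;
    req_a.enqueue(net);  // net now uses req_a's predictor
    req_b.enqueue(net);  // ...and req_b's when req_b runs
    return 0;
}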
@@ -247,7 +247,8 @@ public:
     const variables_state_info_map& get_variables_state_info() const;
     const ExecutionConfig& get_config() const { return _config; }
 
-    ShapePredictor& get_shape_predictor() { return *_shape_predictor; }
+    std::shared_ptr<ShapePredictor> get_shape_predictor() { return _shape_predictor; }
+    void set_shape_predictor(std::shared_ptr<ShapePredictor> shape_predictor) { _shape_predictor = shape_predictor; }
 
 #ifdef GPU_DEBUG_CONFIG
     int64_t get_current_iteration_num() { return iteration; }
@@ -287,7 +288,7 @@ private:
     std::unordered_map<primitive_id, event::ptr> _old_events;
     output_chains_map _output_chains;
 
-    std::unique_ptr<ShapePredictor> _shape_predictor;
+    std::shared_ptr<ShapePredictor> _shape_predictor;
 
     void build_exec_order();
     void allocate_primitive_instance(program_node const& node);
@@ -83,6 +83,7 @@ private:
     std::shared_ptr<Graph> m_graph;
     RemoteContextImpl::Ptr m_context = nullptr;
     std::shared_ptr<ov::threading::IStreamsExecutor> m_stream_executor = nullptr;
+    std::shared_ptr<cldnn::ShapePredictor> m_shape_predictor = nullptr;
     bool m_enable_profiling = false;
     bool m_use_external_queue = false;
 
@@ -37,8 +37,9 @@ struct condition_impl : typed_primitive_impl<condition> {
         set_node_params(instance.get_node());
 
         auto pred = condition_inst::get_pred_from_memory(instance.pred_memory_ptr(), instance.get_network().get_stream());
-        network::ptr executed_net = pred? instance.get_net_true() : instance.get_net_false();
-        auto branch = pred? instance.get_branch_true() : instance.get_branch_false();
+        network::ptr executed_net = pred ? instance.get_net_true() : instance.get_net_false();
+        auto branch = pred ? instance.get_branch_true() : instance.get_branch_false();
+        executed_net->set_shape_predictor(instance.get_network().get_shape_predictor());
         GPU_DEBUG_LOG << "predicate: " << (pred ? "True" : "False") << std::endl;
 
         // Set input memory of inner network before its execution
@@ -121,6 +121,7 @@ struct loop_impl : typed_primitive_impl<loop> {
 
         auto ev = stream.create_user_event(false);
 
+        body_network->set_shape_predictor(outer_network.get_shape_predictor());
         OPENVINO_ASSERT(!primitive->num_iteration_id.empty(), "loop operation should have num_iteration_id");
 
         auto num_iterations = instance.get_num_iterations();
@@ -427,7 +427,7 @@ event::ptr primitive_inst::realloc_if_needed() {
     }
 
     auto current_shape = actual_layout.get_shape();
-    auto& sp = get_network().get_shape_predictor();
+    auto& sp = *get_network().get_shape_predictor();
    auto dt_size = ov::element::Type(actual_layout.data_type).bitwidth();
     auto prealloc_info = sp.predict_preallocation_shape(id(), current_shape, dt_size, can_reuse_buffer);
     if (prealloc_info.first && sp.can_preallocate(ov::shape_size(prealloc_info.second) * dt_size)) {
@@ -226,6 +226,7 @@ SyncInferRequest::SyncInferRequest(const std::shared_ptr<const CompiledModel>& c
     : ov::ISyncInferRequest(compiled_model)
     , m_graph(compiled_model->get_graph(0))
     , m_context(std::static_pointer_cast<RemoteContextImpl>(compiled_model->get_context_impl()))
+    , m_shape_predictor(new cldnn::ShapePredictor(&m_graph->get_engine(), m_graph->get_config().get_property(ov::intel_gpu::buffers_preallocation_ratio)))
     , m_enable_profiling(m_graph->get_config().get_property(ov::enable_profiling))
     , m_use_external_queue(m_graph->use_external_queue()) {
     bool is_legacy_api = !compiled_model->is_new_api();
@@ -233,6 +234,17 @@ SyncInferRequest::SyncInferRequest(const std::shared_ptr<const CompiledModel>& c
     allocate_inputs();
     allocate_outputs();
     allocate_states();
+
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    GPU_DEBUG_IF(debug_config->mem_preallocation_params.is_initialized) {
+        auto& mem_preallocation_params = debug_config->mem_preallocation_params;
+        m_shape_predictor.reset(
+            new cldnn::ShapePredictor(&m_graph->get_engine(),
+                                      mem_preallocation_params.next_iters_preallocation_count,
+                                      mem_preallocation_params.max_per_iter_size,
+                                      mem_preallocation_params.max_per_dim_diff,
+                                      mem_preallocation_params.buffers_preallocation_ratio));
+    }
 }
 
 void SyncInferRequest::infer() {
@@ -401,6 +413,7 @@ void SyncInferRequest::enqueue() {
 
     auto network = m_graph->get_network();
     network->assign_variables_memories();
+    network->set_shape_predictor(m_shape_predictor);
 
     m_internal_outputs.clear();
     m_internal_outputs = network->execute(dependencies);
@@ -476,8 +489,7 @@ void SyncInferRequest::wait() {
                 need_reallocate = usm_host_tensor->get_impl()->get_original_memory()->size() < output_memory->size();
 
             if (need_reallocate) {
-                auto& shape_predictor = m_graph->get_network()->get_shape_predictor();
-                auto actual_memory_shape = predict_shape(name, mem_shape, output_tensor->get_element_type(), shape_predictor);
+                auto actual_memory_shape = predict_shape(name, mem_shape, output_tensor->get_element_type(), *m_shape_predictor);
                 output_tensor->set_shape(actual_memory_shape);
             }
         }
@@ -585,8 +597,7 @@ TensorWrapper SyncInferRequest::create_or_share_device_tensor(const TensorWrappe
 
     auto actual_memory_shape = tensor_shape;
     if (is_dynamic) {
-        auto& shape_predictor = m_graph->get_network()->get_shape_predictor();
-        actual_memory_shape = predict_shape(name, tensor_shape, element_type, shape_predictor);
+        actual_memory_shape = predict_shape(name, tensor_shape, element_type, *m_shape_predictor);
     }
 
     return { create_device_tensor(actual_memory_shape, element_type, need_lockable_mem), TensorOwner::PLUGIN };
@@ -746,7 +757,7 @@ std::vector<cldnn::event::ptr> SyncInferRequest::prepare_input(const std::string
 
     if (is_remote) {
         m_plugin_inputs[name] = user_tensor_wrapper;
-    } else if (is_usm_host_tensor && !convert_needed) {
+    } else if (is_usm_host_tensor && !convert_needed && can_use_usm_host(engine)) {
         m_plugin_inputs[name] = {usm_host_ptr->get_impl(), user_tensor_wrapper.owner};
         is_remote = true;
     }
@@ -762,8 +773,7 @@ std::vector<cldnn::event::ptr> SyncInferRequest::prepare_input(const std::string
     auto device_tensor = std::dynamic_pointer_cast<RemoteTensorImpl>(device_tensor_wrapper.ptr);
     if (is_dynamic) {
         if (device_tensor->get_original_memory()->size() < user_tensor->get_byte_size()) {
-            auto& shape_predictor = network->get_shape_predictor();
-            auto actual_shape = predict_shape(name, user_tensor->get_shape(), device_tensor_et, shape_predictor);
+            auto actual_shape = predict_shape(name, user_tensor->get_shape(), device_tensor_et, *m_shape_predictor);
             GPU_DEBUG_TRACE_DETAIL << " actual memory shape: " << actual_shape.to_string() << std::endl;
             auto new_tensor = create_device_tensor(actual_shape, device_tensor_et, false);
             new_tensor->set_shape(user_tensor->get_shape());