[GPU] Make ShapePredictor instance unique for each InferRequest instead of the cldnn::network (#21019)
commit 306137f86b (parent 51da30b48d)
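In short: ShapePredictor ownership moves out of cldnn::network (previously a std::unique_ptr member) and into each SyncInferRequest, which creates its own predictor and installs it on the shared network right before execution; loop and condition body networks inherit the outer network's predictor. Below is a minimal sketch of that ownership flow. The class names mirror the real ones, but the members and bodies are simplified stand-ins for illustration, not the actual OpenVINO implementation.

#include <memory>

// Simplified stand-ins; the real cldnn::ShapePredictor tracks per-buffer
// preallocation statistics used by realloc_if_needed().
struct ShapePredictor {};

struct network {
    // After this PR the network only borrows the predictor through a shared_ptr
    // (it used to own it via std::unique_ptr).
    std::shared_ptr<ShapePredictor> get_shape_predictor() { return _shape_predictor; }
    void set_shape_predictor(std::shared_ptr<ShapePredictor> sp) { _shape_predictor = std::move(sp); }

private:
    std::shared_ptr<ShapePredictor> _shape_predictor;
};

struct SyncInferRequest {
    SyncInferRequest() : m_shape_predictor(std::make_shared<ShapePredictor>()) {}

    void enqueue(network& net) {
        // Each request installs its own predictor before executing, so two
        // requests that share one compiled network no longer share predictor state.
        net.set_shape_predictor(m_shape_predictor);
        // net.execute(...);
    }

private:
    std::shared_ptr<ShapePredictor> m_shape_predictor;  // one predictor per infer request
};

int main() {
    network net;
    SyncInferRequest req_a, req_b;
    req_a.enqueue(net);  // net now uses req_a's predictor
    req_b.enqueue(net);  // ...and req_b's when req_b runs
    return 0;
}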
@@ -247,7 +247,8 @@ public:
     const variables_state_info_map& get_variables_state_info() const;
     const ExecutionConfig& get_config() const { return _config; }
 
-    ShapePredictor& get_shape_predictor() { return *_shape_predictor; }
+    std::shared_ptr<ShapePredictor> get_shape_predictor() { return _shape_predictor; }
+    void set_shape_predictor(std::shared_ptr<ShapePredictor> shape_predictor) { _shape_predictor = shape_predictor; }
 
 #ifdef GPU_DEBUG_CONFIG
     int64_t get_current_iteration_num() { return iteration; }
@@ -287,7 +288,7 @@ private:
     std::unordered_map<primitive_id, event::ptr> _old_events;
     output_chains_map _output_chains;
 
-    std::unique_ptr<ShapePredictor> _shape_predictor;
+    std::shared_ptr<ShapePredictor> _shape_predictor;
 
     void build_exec_order();
     void allocate_primitive_instance(program_node const& node);
@@ -83,6 +83,7 @@ private:
     std::shared_ptr<Graph> m_graph;
     RemoteContextImpl::Ptr m_context = nullptr;
     std::shared_ptr<ov::threading::IStreamsExecutor> m_stream_executor = nullptr;
+    std::shared_ptr<cldnn::ShapePredictor> m_shape_predictor = nullptr;
     bool m_enable_profiling = false;
     bool m_use_external_queue = false;
 
@@ -37,8 +37,9 @@ struct condition_impl : typed_primitive_impl<condition> {
         set_node_params(instance.get_node());
 
         auto pred = condition_inst::get_pred_from_memory(instance.pred_memory_ptr(), instance.get_network().get_stream());
-        network::ptr executed_net = pred? instance.get_net_true() : instance.get_net_false();
-        auto branch = pred? instance.get_branch_true() : instance.get_branch_false();
+        network::ptr executed_net = pred ? instance.get_net_true() : instance.get_net_false();
+        auto branch = pred ? instance.get_branch_true() : instance.get_branch_false();
+        executed_net->set_shape_predictor(instance.get_network().get_shape_predictor());
         GPU_DEBUG_LOG << "predicate: " << (pred ? "True" : "False") << std::endl;
 
         // Set input memory of inner network before its execution
@@ -121,6 +121,7 @@ struct loop_impl : typed_primitive_impl<loop> {
 
         auto ev = stream.create_user_event(false);
 
+        body_network->set_shape_predictor(outer_network.get_shape_predictor());
         OPENVINO_ASSERT(!primitive->num_iteration_id.empty(), "loop operation should have num_iteration_id");
 
         auto num_iterations = instance.get_num_iterations();
@@ -427,7 +427,7 @@ event::ptr primitive_inst::realloc_if_needed() {
     }
 
     auto current_shape = actual_layout.get_shape();
-    auto& sp = get_network().get_shape_predictor();
+    auto& sp = *get_network().get_shape_predictor();
    auto dt_size = ov::element::Type(actual_layout.data_type).bitwidth();
     auto prealloc_info = sp.predict_preallocation_shape(id(), current_shape, dt_size, can_reuse_buffer);
     if (prealloc_info.first && sp.can_preallocate(ov::shape_size(prealloc_info.second) * dt_size)) {
@@ -226,6 +226,7 @@ SyncInferRequest::SyncInferRequest(const std::shared_ptr<const CompiledModel>& c
     : ov::ISyncInferRequest(compiled_model)
     , m_graph(compiled_model->get_graph(0))
     , m_context(std::static_pointer_cast<RemoteContextImpl>(compiled_model->get_context_impl()))
+    , m_shape_predictor(new cldnn::ShapePredictor(&m_graph->get_engine(), m_graph->get_config().get_property(ov::intel_gpu::buffers_preallocation_ratio)))
     , m_enable_profiling(m_graph->get_config().get_property(ov::enable_profiling))
     , m_use_external_queue(m_graph->use_external_queue()) {
     bool is_legacy_api = !compiled_model->is_new_api();
@@ -233,6 +234,17 @@ SyncInferRequest::SyncInferRequest(const std::shared_ptr<const CompiledModel>& c
     allocate_inputs();
     allocate_outputs();
     allocate_states();
+
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    GPU_DEBUG_IF(debug_config->mem_preallocation_params.is_initialized) {
+        auto& mem_preallocation_params = debug_config->mem_preallocation_params;
+        m_shape_predictor.reset(
+            new cldnn::ShapePredictor(&m_graph->get_engine(),
+                                      mem_preallocation_params.next_iters_preallocation_count,
+                                      mem_preallocation_params.max_per_iter_size,
+                                      mem_preallocation_params.max_per_dim_diff,
+                                      mem_preallocation_params.buffers_preallocation_ratio));
+    }
 }
 
 void SyncInferRequest::infer() {
@@ -401,6 +413,7 @@ void SyncInferRequest::enqueue() {
 
     auto network = m_graph->get_network();
     network->assign_variables_memories();
+    network->set_shape_predictor(m_shape_predictor);
 
     m_internal_outputs.clear();
     m_internal_outputs = network->execute(dependencies);
@@ -476,8 +489,7 @@ void SyncInferRequest::wait() {
                 need_reallocate = usm_host_tensor->get_impl()->get_original_memory()->size() < output_memory->size();
 
             if (need_reallocate) {
-                auto& shape_predictor = m_graph->get_network()->get_shape_predictor();
-                auto actual_memory_shape = predict_shape(name, mem_shape, output_tensor->get_element_type(), shape_predictor);
+                auto actual_memory_shape = predict_shape(name, mem_shape, output_tensor->get_element_type(), *m_shape_predictor);
                 output_tensor->set_shape(actual_memory_shape);
             }
         }
@@ -585,8 +597,7 @@ TensorWrapper SyncInferRequest::create_or_share_device_tensor(const TensorWrappe
 
     auto actual_memory_shape = tensor_shape;
     if (is_dynamic) {
-        auto& shape_predictor = m_graph->get_network()->get_shape_predictor();
-        actual_memory_shape = predict_shape(name, tensor_shape, element_type, shape_predictor);
+        actual_memory_shape = predict_shape(name, tensor_shape, element_type, *m_shape_predictor);
     }
 
     return { create_device_tensor(actual_memory_shape, element_type, need_lockable_mem), TensorOwner::PLUGIN };
@@ -746,7 +757,7 @@ std::vector<cldnn::event::ptr> SyncInferRequest::prepare_input(const std::string
 
     if (is_remote) {
         m_plugin_inputs[name] = user_tensor_wrapper;
-    } else if (is_usm_host_tensor && !convert_needed) {
+    } else if (is_usm_host_tensor && !convert_needed && can_use_usm_host(engine)) {
         m_plugin_inputs[name] = {usm_host_ptr->get_impl(), user_tensor_wrapper.owner};
         is_remote = true;
     }
@@ -762,8 +773,7 @@ std::vector<cldnn::event::ptr> SyncInferRequest::prepare_input(const std::string
     auto device_tensor = std::dynamic_pointer_cast<RemoteTensorImpl>(device_tensor_wrapper.ptr);
     if (is_dynamic) {
         if (device_tensor->get_original_memory()->size() < user_tensor->get_byte_size()) {
-            auto& shape_predictor = network->get_shape_predictor();
-            auto actual_shape = predict_shape(name, user_tensor->get_shape(), device_tensor_et, shape_predictor);
+            auto actual_shape = predict_shape(name, user_tensor->get_shape(), device_tensor_et, *m_shape_predictor);
             GPU_DEBUG_TRACE_DETAIL << " actual memory shape: " << actual_shape.to_string() << std::endl;
             auto new_tensor = create_device_tensor(actual_shape, device_tensor_et, false);
             new_tensor->set_shape(user_tensor->get_shape());