[GPU] Reduce host time overhead in ReadValue execution for stateful model (#21521)

Andrew Kwangwoong Park 2023-12-13 19:06:30 +09:00 committed by GitHub
parent 3fb60dc41c
commit 71e7015d39
2 changed files with 23 additions and 2 deletions
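
Scope of the change, as inferred from the diff below: read_value primitives may now be optimized out (reusing the state variable's memory as their output) even when they have additional shape_of users, and realloc_if_needed() skips output reallocation for such optimized stateful primitives. The saving is per-inference host time, which accumulates when a stateful model is run iteratively. A minimal sketch of that scenario with the OpenVINO 2.0 C++ API follows; the model path and iteration count are placeholders, not part of this PR.

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    // "stateful_model.xml" is a placeholder for any model containing ReadValue/Assign pairs.
    auto compiled = core.compile_model("stateful_model.xml", "GPU");
    ov::InferRequest request = compiled.create_infer_request();

    // Every iteration executes the ReadValue/Assign primitives, so host-time overhead
    // in ReadValue is paid on each step of the loop.
    for (int step = 0; step < 16; ++step) {
        // Input tensor setup omitted for brevity.
        request.infer();
    }

    // Reset the variable states before starting an independent sequence.
    for (auto&& state : request.query_state())
        state.reset();
    return 0;
}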

@@ -14,6 +14,7 @@
#include "resample_inst.h"
#include "loop_inst.h"
#include "strided_slice_inst.h"
#include "shape_of_inst.h"
#include "non_max_suppression_inst.h"
#include "experimental_detectron_roi_feature_extractor_inst.hpp"
#include "border_inst.h"
@@ -407,6 +408,19 @@ static bool can_crop_be_optimized_along_batch(const crop_node& node) {
    return false;
}

static bool can_read_value_be_optimize(const read_value_node& node) {
    if (node.get_users().size() == 1)
        return true;

    const auto non_shape_of_users_count = std::count_if(node.get_users().begin(), node.get_users().end(), [](const program_node* user) {
        return !user->is_type<shape_of>();
    });
    if (non_shape_of_users_count <= 1)
        return true;

    return false;
}

static void propagate_padding_to_opt_out_users(program_node& node, cldnn::padding padding_data) {
    if (padding_data == cldnn::padding())
        return;
@@ -632,10 +646,10 @@ void prepare_buffer_fusing::run(program& p) {
            //   ┌────┴──────┐
            //   │  Result   │
            //   └───────────┘
            // If read_value here returns virable memory w/o copy, then based on Add-s and Assign execution order we may have different results
            // If read_value here returns variable memory w/o copy, then based on Add-s and Assign execution order we may have different results
            // TODO: Allow optimizations for the case above too. Looks like it can be achieved by more careful
            // topological sort (i.e. if we ensure that all read_value users are completed before assign is run)
            node.can_be_optimized(node.get_users().size() == 1);
            node.can_be_optimized(can_read_value_be_optimize(node));
        });
    }
}
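
The new helper can_read_value_be_optimize() ignores shape_of users when deciding whether the read_value output may alias the variable memory, since shape_of only consumes the tensor's shape, not its data. Below is a standalone sketch of the same counting rule, using a hypothetical user struct instead of the plugin's program_node API.

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

struct user { std::string type; };  // hypothetical stand-in for cldnn::program_node

// Same rule as can_read_value_be_optimize(): at most one data-consuming user may remain
// once shape_of users (which never touch the tensor data) are excluded.
static bool can_optimize(const std::vector<user>& users) {
    if (users.size() == 1)
        return true;
    const auto non_shape_of_users = std::count_if(users.begin(), users.end(),
        [](const user& u) { return u.type != "shape_of"; });
    return non_shape_of_users <= 1;
}

int main() {
    assert(can_optimize({{"gemm"}}));                                // single user
    assert(can_optimize({{"gemm"}, {"shape_of"}, {"shape_of"}}));    // extra shape_of users are fine
    assert(!can_optimize({{"gemm"}, {"eltwise"}}));                  // two data consumers: keep the copy
    return 0;
}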

@@ -445,10 +445,17 @@ event::ptr primitive_inst::realloc_if_needed() {
    if (_node->is_type<input_layout>())
        return ev;

    // read_value/assign nodes are supposed to always use variable memory
    if (auto stateful_prim = dynamic_cast<memory_state::variable*>(this)) {
        std::string variable_id = stateful_prim->variable_id();
        auto variable = get_network().get_variable(variable_id);
        variable.set_layout(actual_layout);
        GPU_DEBUG_TRACE_DETAIL << id() << ": use variable memory " << variable.get_memory()
                               << " (size=" << variable.get_memory()->size() << ")" << std::endl;
        // For nodes that can be optimized, variable memory is used as output memory
        // so there is no need for output memory reallocation
        if (can_be_optimized())
            return ev;
    }

    // Update output layout with respect to FC's fake alignment
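
The early return above relies on the fact that an optimized read_value uses the variable memory directly as its output, so there is nothing to reallocate. A toy sketch of that idea with illustrative stub types (not the actual cldnn::memory / primitive_inst API):

#include <cstddef>
#include <memory>
#include <vector>

using buffer = std::vector<float>;                 // stand-in for cldnn::memory

struct variable_stub { std::shared_ptr<buffer> memory; };

struct read_value_stub {
    bool optimized = false;                        // result of can_read_value_be_optimize()
    std::shared_ptr<buffer> output;

    void realloc_if_needed(variable_stub& var, std::size_t required_elems) {
        if (optimized) {
            // Optimized path: output aliases the variable memory, so no allocation
            // (and no copy at execution time) is needed.
            output = var.memory;
            return;
        }
        // Non-optimized path: keep a dedicated output buffer of sufficient size.
        if (!output || output->size() < required_elems)
            output = std::make_shared<buffer>(required_elems);
    }
};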