From 71e7015d39c52339bacf146a6c1834ca43283277 Mon Sep 17 00:00:00 2001
From: Andrew Kwangwoong Park
Date: Wed, 13 Dec 2023 19:06:30 +0900
Subject: [PATCH] [GPU] Reduce host time overhead in ReadValue execution for
 stateful model (#21521)

---
 .../graph_optimizer/prepare_buffer_fusing.cpp | 18 ++++++++++++++++--
 .../intel_gpu/src/graph/primitive_inst.cpp    |  7 +++++++
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
index 95e6079f48f..db67b47cb71 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
@@ -14,6 +14,7 @@
 #include "resample_inst.h"
 #include "loop_inst.h"
 #include "strided_slice_inst.h"
+#include "shape_of_inst.h"
 #include "non_max_suppression_inst.h"
 #include "experimental_detectron_roi_feature_extractor_inst.hpp"
 #include "border_inst.h"
@@ -407,6 +408,19 @@ static bool can_crop_be_optimized_along_batch(const crop_node& node) {
     return false;
 }
 
+static bool can_read_value_be_optimize(const read_value_node& node) {
+    if (node.get_users().size() == 1)
+        return true;
+
+    const auto non_shape_of_users_count = std::count_if(node.get_users().begin(), node.get_users().end(), [](const program_node* user) {
+        return !user->is_type<shape_of>();
+    });
+    if (non_shape_of_users_count <= 1)
+        return true;
+
+    return false;
+}
+
 static void propagate_padding_to_opt_out_users(program_node& node, cldnn::padding padding_data) {
     if (padding_data == cldnn::padding())
         return;
@@ -632,10 +646,10 @@ void prepare_buffer_fusing::run(program& p) {
             //                 ┌────┴──────┐
             //                 │  Result   │
             //                 └───────────┘
-            // If read_value here returns virable memory w/o copy, then based on Add-s and Assign execution order we may have different results
+            // If read_value here returns variable memory w/o copy, then based on Add-s and Assign execution order we may have different results
             // TODO: Allow optimizations for the case above too. Looks like it can be achieved by more careful
             // topological sort (i.e. if we ensure that all read_value users are completed before assign is run)
-            node.can_be_optimized(node.get_users().size() == 1);
+            node.can_be_optimized(can_read_value_be_optimize(node));
         });
     }
 }
diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
index 8c3b430e3ef..cf4b84fee6f 100644
--- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
+++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -445,10 +445,17 @@ event::ptr primitive_inst::realloc_if_needed() {
     if (_node->is_type<input_layout>())
         return ev;
 
+    // read_value/assign nodes are supposed to always use variable memory
     if (auto stateful_prim = dynamic_cast<memory_state::variable*>(this)) {
         std::string variable_id = stateful_prim->variable_id();
         auto variable = get_network().get_variable(variable_id);
         variable.set_layout(actual_layout);
+        GPU_DEBUG_TRACE_DETAIL << id() << ": use variable memory " << variable.get_memory()
+                               << " (size=" << variable.get_memory()->size() << ")" << std::endl;
+        // For nodes that can be optimized, variable memory is used as output memory
+        // so there is no need for output memory reallocation
+        if (can_be_optimized())
+            return ev;
     }
 
     // Update output layout with respect to FC's fake alignment
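
Illustration (not part of the patch): the rule added as can_read_value_be_optimize() allows the copy of the variable memory, and the matching output reallocation in realloc_if_needed(), to be skipped when at most one user of the read_value node is something other than shape_of, presumably because shape_of users only consume shape metadata rather than the buffer contents. The sketch below restates that rule with hypothetical stand-in types (Node, can_skip_read_value_copy), not the real cldnn classes.

// Hypothetical stand-in for cldnn::program_node, used only for illustration.
#include <algorithm>
#include <string>
#include <vector>

struct Node {
    std::string type;                // e.g. "shape_of", "eltwise", "assign"
    std::vector<const Node*> users;  // consumers of this node's output
};

// Mirrors the user-counting rule of can_read_value_be_optimize():
// skip the copy when at most one user actually needs the data.
static bool can_skip_read_value_copy(const Node& read_value) {
    if (read_value.users.size() == 1)
        return true;

    const auto non_shape_of_users =
        std::count_if(read_value.users.begin(), read_value.users.end(),
                      [](const Node* user) { return user->type != "shape_of"; });
    return non_shape_of_users <= 1;
}

int main() {
    Node shape_of_user{"shape_of", {}};
    Node add_user{"eltwise", {}};
    Node read_value{"read_value", {&shape_of_user, &add_user}};
    // One data consumer plus one shape_of consumer -> the copy can be skipped.
    return can_skip_read_value_copy(read_value) ? 0 : 1;
}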