From 71e7015d39c52339bacf146a6c1834ca43283277 Mon Sep 17 00:00:00 2001
From: Andrew Kwangwoong Park
Date: Wed, 13 Dec 2023 19:06:30 +0900
Subject: [PATCH] [GPU] Reduce host time overhead in ReadValue execution for
 stateful model (#21521)

---
 .../graph_optimizer/prepare_buffer_fusing.cpp | 18 ++++++++++++++++--
 .../intel_gpu/src/graph/primitive_inst.cpp    |  7 +++++++
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
index 95e6079f48f..db67b47cb71 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
@@ -14,6 +14,7 @@
 #include "resample_inst.h"
 #include "loop_inst.h"
 #include "strided_slice_inst.h"
+#include "shape_of_inst.h"
 #include "non_max_suppression_inst.h"
 #include "experimental_detectron_roi_feature_extractor_inst.hpp"
 #include "border_inst.h"
@@ -407,6 +408,19 @@ static bool can_crop_be_optimized_along_batch(const crop_node& node) {
     return false;
 }
 
+static bool can_read_value_be_optimize(const read_value_node& node) {
+    if (node.get_users().size() == 1)
+        return true;
+
+    const auto non_shape_of_users_count = std::count_if(node.get_users().begin(), node.get_users().end(), [](const program_node* user) {
+        return !user->is_type<shape_of>();
+    });
+    if (non_shape_of_users_count <= 1)
+        return true;
+
+    return false;
+}
+
 static void propagate_padding_to_opt_out_users(program_node& node, cldnn::padding padding_data) {
     if (padding_data == cldnn::padding())
         return;
@@ -632,10 +646,10 @@ void prepare_buffer_fusing::run(program& p) {
             //                 ┌────┴──────┐
             //                 │  Result   │
             //                 └───────────┘
-            // If read_value here returns virable memory w/o copy, then based on Add-s and Assign execution order we may have different results
+            // If read_value here returns variable memory w/o copy, then based on Add-s and Assign execution order we may have different results
             // TODO: Allow optimizations for the case above too. Looks like it can be achieved by more careful
             // topological sort (i.e. if we ensure that all read_value users are completed before assign is run)
-            node.can_be_optimized(node.get_users().size() == 1);
+            node.can_be_optimized(can_read_value_be_optimize(node));
         });
     }
 }
diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
index 8c3b430e3ef..cf4b84fee6f 100644
--- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
+++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -445,10 +445,17 @@ event::ptr primitive_inst::realloc_if_needed() {
     if (_node->is_type<input_layout>())
         return ev;
 
+    // read_value/assign nodes are supposed to always use variable memory
     if (auto stateful_prim = dynamic_cast<memory_state::variable*>(this)) {
         std::string variable_id = stateful_prim->variable_id();
         auto variable = get_network().get_variable(variable_id);
         variable.set_layout(actual_layout);
+        GPU_DEBUG_TRACE_DETAIL << id() << ": use variable memory " << variable.get_memory()
+                               << " (size=" << variable.get_memory()->size() << ")" << std::endl;
+        // For nodes that can be optimized, variable memory is used as output memory
+        // so there is no need for output memory reallocation
+        if (can_be_optimized())
+            return ev;
     }
 
     // Update output layout with respect to FC's fake alignment
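
Illustration (not part of the patch): the rule added as can_read_value_be_optimize() allows the copy of the variable memory, and the matching output reallocation in realloc_if_needed(), to be skipped when at most one user of the read_value node is something other than shape_of, presumably because shape_of users only consume shape metadata rather than the buffer contents. The sketch below restates that rule with hypothetical stand-in types (Node, can_skip_read_value_copy), not the real cldnn classes.

// Hypothetical stand-in for cldnn::program_node, used only for illustration.
#include <algorithm>
#include <string>
#include <vector>

struct Node {
    std::string type;                // e.g. "shape_of", "eltwise", "assign"
    std::vector<const Node*> users;  // consumers of this node's output
};

// Mirrors the user-counting rule of can_read_value_be_optimize():
// skip the copy when at most one user actually needs the data.
static bool can_skip_read_value_copy(const Node& read_value) {
    if (read_value.users.size() == 1)
        return true;

    const auto non_shape_of_users =
        std::count_if(read_value.users.begin(), read_value.users.end(),
                      [](const Node* user) { return user->type != "shape_of"; });
    return non_shape_of_users <= 1;
}

int main() {
    Node shape_of_user{"shape_of", {}};
    Node add_user{"eltwise", {}};
    Node read_value{"read_value", {&shape_of_user, &add_user}};
    // One data consumer plus one shape_of consumer -> the copy can be skipped.
    return can_skip_read_value_copy(read_value) ? 0 : 1;
}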