[GPU] Reduce host time overhead in ReadValue execution for stateful model (#21521)

Andrew Kwangwoong Park 2023-12-13 19:06:30 +09:00 committed by GitHub
parent 3fb60dc41c
commit 71e7015d39
2 changed files with 23 additions and 2 deletions
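
Scope of the change, as inferred from the diff below: read_value primitives may now be optimized out (reusing the state variable's memory as their output) even when they have additional shape_of users, and realloc_if_needed() skips output reallocation for such optimized stateful primitives. The saving is per-inference host time, which accumulates when a stateful model is run iteratively. A minimal sketch of that scenario with the OpenVINO 2.0 C++ API follows; the model path and iteration count are placeholders, not part of this PR.

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    // "stateful_model.xml" is a placeholder for any model containing ReadValue/Assign pairs.
    auto compiled = core.compile_model("stateful_model.xml", "GPU");
    ov::InferRequest request = compiled.create_infer_request();

    // Every iteration executes the ReadValue/Assign primitives, so host-time overhead
    // in ReadValue is paid on each step of the loop.
    for (int step = 0; step < 16; ++step) {
        // Input tensor setup omitted for brevity.
        request.infer();
    }

    // Reset the variable states before starting an independent sequence.
    for (auto&& state : request.query_state())
        state.reset();
    return 0;
}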

@@ -14,6 +14,7 @@
#include "resample_inst.h"
#include "loop_inst.h"
#include "strided_slice_inst.h"
#include "shape_of_inst.h"
#include "non_max_suppression_inst.h"
#include "experimental_detectron_roi_feature_extractor_inst.hpp"
#include "border_inst.h"
@@ -407,6 +408,19 @@ static bool can_crop_be_optimized_along_batch(const crop_node& node) {
    return false;
}

static bool can_read_value_be_optimize(const read_value_node& node) {
    if (node.get_users().size() == 1)
        return true;

    const auto non_shape_of_users_count = std::count_if(node.get_users().begin(), node.get_users().end(), [](const program_node* user) {
        return !user->is_type<shape_of>();
    });
    if (non_shape_of_users_count <= 1)
        return true;

    return false;
}

static void propagate_padding_to_opt_out_users(program_node& node, cldnn::padding padding_data) {
    if (padding_data == cldnn::padding())
        return;
@@ -632,10 +646,10 @@ void prepare_buffer_fusing::run(program& p) {
            //   ┌────┴──────┐
            //   │  Result   │
            //   └───────────┘
            // If read_value here returns virable memory w/o copy, then based on Add-s and Assign execution order we may have different results
            // If read_value here returns variable memory w/o copy, then based on Add-s and Assign execution order we may have different results
            // TODO: Allow optimizations for the case above too. Looks like it can be achieved by more careful
            // topological sort (i.e. if we ensure that all read_value users are completed before assign is run)
            node.can_be_optimized(node.get_users().size() == 1);
            node.can_be_optimized(can_read_value_be_optimize(node));
        });
    }
}
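
The new helper can_read_value_be_optimize() ignores shape_of users when deciding whether the read_value output may alias the variable memory, since shape_of only consumes the tensor's shape, not its data. Below is a standalone sketch of the same counting rule, using a hypothetical user struct instead of the plugin's program_node API.

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

struct user { std::string type; };  // hypothetical stand-in for cldnn::program_node

// Same rule as can_read_value_be_optimize(): at most one data-consuming user may remain
// once shape_of users (which never touch the tensor data) are excluded.
static bool can_optimize(const std::vector<user>& users) {
    if (users.size() == 1)
        return true;
    const auto non_shape_of_users = std::count_if(users.begin(), users.end(),
        [](const user& u) { return u.type != "shape_of"; });
    return non_shape_of_users <= 1;
}

int main() {
    assert(can_optimize({{"gemm"}}));                                // single user
    assert(can_optimize({{"gemm"}, {"shape_of"}, {"shape_of"}}));    // extra shape_of users are fine
    assert(!can_optimize({{"gemm"}, {"eltwise"}}));                  // two data consumers: keep the copy
    return 0;
}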

@@ -445,10 +445,17 @@ event::ptr primitive_inst::realloc_if_needed() {
    if (_node->is_type<input_layout>())
        return ev;

    // read_value/assign nodes are supposed to always use variable memory
    if (auto stateful_prim = dynamic_cast<memory_state::variable*>(this)) {
        std::string variable_id = stateful_prim->variable_id();
        auto variable = get_network().get_variable(variable_id);
        variable.set_layout(actual_layout);
        GPU_DEBUG_TRACE_DETAIL << id() << ": use variable memory " << variable.get_memory()
                               << " (size=" << variable.get_memory()->size() << ")" << std::endl;
        // For nodes that can be optimized, variable memory is used as output memory
        // so there is no need for output memory reallocation
        if (can_be_optimized())
            return ev;
    }

    // Update output layout with respect to FC's fake alignment
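
The early return above relies on the fact that an optimized read_value uses the variable memory directly as its output, so there is nothing to reallocate. A toy sketch of that idea with illustrative stub types (not the actual cldnn::memory / primitive_inst API):

#include <cstddef>
#include <memory>
#include <vector>

using buffer = std::vector<float>;                 // stand-in for cldnn::memory

struct variable_stub { std::shared_ptr<buffer> memory; };

struct read_value_stub {
    bool optimized = false;                        // result of can_read_value_be_optimize()
    std::shared_ptr<buffer> output;

    void realloc_if_needed(variable_stub& var, std::size_t required_elems) {
        if (optimized) {
            // Optimized path: output aliases the variable memory, so no allocation
            // (and no copy at execution time) is needed.
            output = var.memory;
            return;
        }
        // Non-optimized path: keep a dedicated output buffer of sufficient size.
        if (!output || output->size() < required_elems)
            output = std::make_shared<buffer>(required_elems);
    }
};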