[GPU] Fix multistream issue for dynamic shape (#13433)

- Separate kernels_cache::add_kernel from factory (choose_impl)
- Reset kernels_cache._kernels after kernels_cache.compile
- Create cldnn unit test case to check multi-stream processing
This commit is contained in:
Paul Youngsoo Ahn 2022-11-09 10:34:40 +09:00 committed by GitHub
parent 9943edfa34
commit 5c17c7e0a0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 209 additions and 46 deletions

View File

@ -11,6 +11,7 @@
#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/runtime/event.hpp"
#include "intel_gpu/runtime/stream.hpp"
#include "intel_gpu/runtime/lru_cache.hpp"
#include <map>
#include <vector>
@ -216,6 +217,15 @@ public:
/// Returns memory state @p variable_id of stateful network
VariableState& get_variable_memory(const std::string &variable_id);
/// Return the kernels_cache used for runtime kernel compilation (dynamic-shape flow)
kernels_cache& get_kernels_cache() const { return *_kernels_cache; }
/// Return the implementations_cache (per-network cache of selected impls)
ImplementationsCache& get_implementations_cache() const { return *_impls_cache; }
/// Return the in_mem_kernels_cache (per-network cache of compiled weight-reorder kernels)
KernelsCache& get_in_mem_kernels_cache() const { return *_in_mem_kernels_cache; }
private:
using output_chains_map = std::map<primitive_id, std::vector<std::shared_ptr<primitive_inst>>>;
uint32_t net_id = 0;
@ -249,5 +259,13 @@ private:
void check_names();
void add_default_output_chains();
output_chains_map::iterator add_output_chain(std::shared_ptr<primitive_inst>& p_inst);
std::unique_ptr<kernels_cache> _kernels_cache;
// Move from cldnn::program to cldnn::network for multi-threads issue.
std::unique_ptr<ImplementationsCache> _impls_cache;
std::unique_ptr<KernelsCache> _in_mem_kernels_cache;
// TODO: initial version use unlimited caches. Need to adjust it once dynamic flow works on wide set of models.
const size_t _impls_cache_capacity = 0;
const size_t _in_mem_kernels_cache_capacity = 0;
};
} // namespace cldnn

View File

@ -247,8 +247,6 @@ public:
void load_tuning_cache();
std::shared_ptr<kernel_selector::TuningCache> get_tuning_cache() const { return tuning_cache; }
ImplementationsCache& get_implementations_cache() const { return *_impls_cache; }
KernelsCache& get_in_mem_kernels_cache() const { return *_in_mem_kernels_cache; }
// returns {-1, -1} if it failed to estimate by allocating given batch size
std::pair<int64_t/*const alloc*/, int64_t/*general alloc*/> get_estimated_device_mem_usage();
@ -261,11 +259,6 @@ private:
stream::ptr _stream;
// TODO: Consider moving it to engine
std::unique_ptr<kernels_cache> _kernels_cache;
std::unique_ptr<ImplementationsCache> _impls_cache;
std::unique_ptr<KernelsCache> _in_mem_kernels_cache;
// TODO: initial version use unlimited caches. Need to adjust it once dynamic flow works on wide set of models.
const size_t _impls_cache_capacity = 0;
const size_t _in_mem_kernels_cache_capacity = 0;
build_options options;
std::list<program_node*> inputs;
std::vector<program_node*> outputs;

View File

@ -34,9 +34,13 @@ void compile_graph::run(program& p) {
for (size_t idx = 0; idx < proc_order.size(); idx++) {
auto& node = *(std::next(proc_order.begin(), idx));
if (!node->is_type<data>() && !(node->is_type<mutable_data>() && node->get_dependencies().empty()) && !node->is_dynamic()) {
tasks.push_back([node, &exception] {
tasks.push_back([node, &p, &exception] {
try {
node->selected_impl = node->type()->choose_impl(*node);
if (node->selected_impl) {
auto kernel_ids = p.get_kernels_cache().add_kernels_source(node->selected_impl->get_kernels_source());
node->selected_impl->set_kernel_ids(kernel_ids);
}
} catch(...) {
exception = std::current_exception();
}

View File

@ -66,6 +66,10 @@ void post_input_reorder::run(program& p) {
reorder.get_output_layout(false);
node->set_output_layout(previous_layout, false);
reorder.set_selected_impl(reorder.type()->choose_impl(reorder));
if (auto impl = reorder.get_selected_impl()) {
auto kernel_ids = p.get_kernels_cache().add_kernels_source(impl->get_kernels_source());
impl->set_kernel_ids(kernel_ids);
}
}
}
}

View File

@ -53,8 +53,13 @@ void post_optimize_weights::optimize_weights(T& node, program& p) {
// Don't run impl selection to avoid double compilation of reorder kernels
// in main program and internal program for constant propagation
if (!g_node.is_constant())
g_node.selected_impl = g_node.type()->choose_impl(g_node);
if (!g_node.is_constant()) {
g_node.set_selected_impl(g_node.type()->choose_impl(g_node));
if (auto impl = g_node.get_selected_impl()) {
auto kernel_ids = p.get_kernels_cache().add_kernels_source(impl->get_kernels_source());
impl->set_kernel_ids(kernel_ids);
}
}
}
}

View File

@ -36,8 +36,11 @@ void remove_redundant_reorders::run(program& p) {
return;
node.set_unique_id();
auto new_impl = node.type()->choose_impl(node);
node.set_selected_impl(std::move(new_impl));
node.set_selected_impl(node.type()->choose_impl(node));
if (auto impl = node.get_selected_impl()) {
auto kernel_ids = p.get_kernels_cache().add_kernels_source(impl->get_kernels_source());
impl->set_kernel_ids(kernel_ids);
}
};
// Fuse reorders into primitives

View File

@ -50,12 +50,6 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
_kernel_data.weightsReorderParams.engine = kernel_selector::generic_kernel_params::Engine::NONE;
_kernel_data.weightsReorderParams.cpuKernel = nullptr;
_kernel_data.weightsReorderParams.clKernel = nullptr;
_kernel_ids.reserve(kd.kernels.size());
// Add selected kernels to kernels_cache for the following compilation and save output ids
for (size_t i = 0; i < kd.kernels.size(); ++i) {
_kernel_ids.emplace_back(arg.get_program().add_kernel(kd.kernels[i].code.kernelString));
}
}
bool is_cpu() const override { return false; }
@ -198,6 +192,18 @@ protected:
bool group_events = (all_events.size() > 1);
return aggregate_events(all_events, stream, group_events);
}
// Records the ids returned by kernels_cache::add_kernels_source for this impl.
// Takes the vector by value and moves it into the member to avoid a second copy.
void set_kernel_ids(std::vector<kernel_id> kernel_ids) override {
    _kernel_ids = std::move(kernel_ids);
}
// Collects the kernel sources selected for this impl so the caller can register
// them in a kernels_cache (see add_kernels_source) and compile them later.
std::vector<std::shared_ptr<cldnn::kernel_string>> get_kernels_source() override {
    std::vector<std::shared_ptr<cldnn::kernel_string>> kernel_strings;
    // Reserve up front: one entry per selected kernel, avoids reallocations.
    kernel_strings.reserve(_kernel_data.kernels.size());
    for (const auto& kernel : _kernel_data.kernels) {
        kernel_strings.push_back(kernel.code.kernelString);
    }
    return kernel_strings;
}
};
} // namespace ocl

View File

@ -54,6 +54,9 @@ struct primitive_impl {
// Ids of this impl's kernels inside a kernels_cache; empty for impls without kernels.
virtual std::vector<std::string> get_kernel_ids() {
    return {};
}
// Kernel sources to register in a kernels_cache; default impl has none to compile.
virtual std::vector<std::shared_ptr<cldnn::kernel_string>> get_kernels_source() { return {}; }
// Injects already-compiled kernels into the impl; no-op for kernel-less impls.
virtual void set_kernels(std::vector<kernel::ptr>) {}
// Stores ids returned by kernels_cache::add_kernels_source; no-op by default.
virtual void set_kernel_ids(std::vector<kernel_id> kernel_ids) {}
// If this flag is set as false, the memory allocated for this primitive is not allowed to be reused
bool can_reuse_memory = true;

View File

@ -31,6 +31,7 @@
#include "kernel_selector_helper.h"
#include "program_helpers.h"
#include "runtime/cldnn_itt.hpp"
#include "kernels_cache.hpp"
#include <algorithm>
#include <string>
@ -293,6 +294,13 @@ network::network(program::ptr program, stream::ptr stream, bool is_internal, boo
build_exec_order();
validate_primitives();
add_default_output_chains();
if (is_dynamic()) {
_kernels_cache = std::unique_ptr<kernels_cache>(new kernels_cache(program->get_engine(), program->get_id(),
kernel_selector::KernelBase::get_db().get_batch_header_str()));
_impls_cache = std::unique_ptr<ImplementationsCache>(new ImplementationsCache(_impls_cache_capacity));
_in_mem_kernels_cache = std::unique_ptr<KernelsCache>(new KernelsCache(_in_mem_kernels_cache_capacity));
}
}
network::network(engine& engine,

View File

@ -286,17 +286,19 @@ void primitive_inst::update_impl() {
};
auto layout_key = get_layout_key();
auto& cache = _network.get_program()->get_implementations_cache();
auto& cache = get_network().get_implementations_cache();
if (cache.has(layout_key)) {
_impl = cache.get(layout_key)->clone();
GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true);
} else {
auto lru = cache.get_lru_element();
_impl = _node->type()->choose_impl(*_node, *_impl_params);
_network.get_program()->compile();
_impl->init_kernels(_network.get_program()->get_kernels_cache());
auto& kernels_cache = get_network().get_kernels_cache();
auto kernel_ids = kernels_cache.add_kernels_source(_impl->get_kernels_source());
_impl->set_kernel_ids(kernel_ids);
kernels_cache.compile();
_impl->init_kernels(kernels_cache);
cache.add(layout_key, _impl->clone());
_network.get_program()->get_kernels_cache().reset();
kernels_cache.reset();
}
reset_shape_change();
@ -560,8 +562,6 @@ event::ptr primitive_inst::update_weights() {
auto original_weights_memory = dep_memory_ptr(weights_idx);
auto original_layout = original_weights_memory->get_layout();
layout expected_layout = from_weights_tensor(weights_params.dest);
auto& program = _node->get_program();
auto& engine = _network.get_engine();
auto get_layout_key = [&]() -> std::string {
@ -574,7 +574,7 @@ event::ptr primitive_inst::update_weights() {
cldnn::kernel::ptr kernel = nullptr;
auto layout_key = get_layout_key();
if (layout_key != "") {
auto& cache = program.get_in_mem_kernels_cache();
auto& cache = get_network().get_in_mem_kernels_cache();
if (cache.has(layout_key)) {
GPU_DEBUG_IF(debug_config->verbose >= 4) {
GPU_DEBUG_COUT << id() << ": reorder weights (cached) from " << original_layout << "\nto " << expected_layout << std::endl;
@ -585,14 +585,16 @@ event::ptr primitive_inst::update_weights() {
GPU_DEBUG_IF(debug_config->verbose >= 4) {
GPU_DEBUG_COUT << id() << ": reorder weights from " << original_layout << "\nto " << expected_layout << std::endl;
}
auto _kernel_id = program.add_kernel(weights_params.clKernel->code.kernelString);
program.compile();
kernel = program.get_kernel(_kernel_id);
auto& kernels_cache = get_network().get_kernels_cache();
auto kernel_id = kernels_cache.set_kernel_source(weights_params.clKernel->code.kernelString, false);
kernels_cache.compile();
kernel = kernels_cache.get_kernel(kernel_id);
cache.add(layout_key, kernel);
kernels_cache.reset();
}
}
auto& stream = _network.get_stream();
auto& stream = get_network().get_stream();
bool can_reuse = _impl_params->reordered_weights != nullptr && _impl_params->reordered_weights->size() <= expected_layout.bytes_count();
if (can_reuse) {

View File

@ -114,8 +114,6 @@ program::program(engine& engine_ref,
prepare_nodes(topology);
_kernels_cache = std::unique_ptr<kernels_cache>(new kernels_cache(_engine, prog_id,
kernel_selector::KernelBase::get_db().get_batch_header_str()));
_impls_cache = std::unique_ptr<ImplementationsCache>(new ImplementationsCache(_impls_cache_capacity));
_in_mem_kernels_cache = std::unique_ptr<KernelsCache>(new KernelsCache(_in_mem_kernels_cache_capacity));
program_node::reset_unique_id();
if (no_optimizations) {
init_graph();
@ -137,8 +135,6 @@ program::program(engine& engine_ref,
set_options();
_kernels_cache = std::unique_ptr<kernels_cache>(new kernels_cache(_engine, prog_id,
kernel_selector::KernelBase::get_db().get_batch_header_str()));
_impls_cache = std::unique_ptr<ImplementationsCache>(new ImplementationsCache(_impls_cache_capacity));
_in_mem_kernels_cache = std::unique_ptr<KernelsCache>(new KernelsCache(_in_mem_kernels_cache_capacity));
pm = std::unique_ptr<pass_manager>(new pass_manager(*this));
prepare_nodes(nodes);
build_program(is_internal);

View File

@ -148,18 +148,8 @@ kernels_cache::kernels_cache(engine& engine, uint32_t prog_id, const std::vector
kernel_id kernels_cache::set_kernel_source(
const std::shared_ptr<kernel_string>& kernel_string,
bool dump_custom_program) {
std::lock_guard<std::mutex> lock(_mutex);
// we need unique id in order to avoid conflict across topologies.
const auto kernel_num = _kernels.size() + (_kernel_idx++);
kernel_id id = kernel_string->entry_point + "_" + std::to_string(kernel_num);
auto res = _kernels_code.emplace(kernel_string, id, dump_custom_program);
assert(_kernels.find(id) == _kernels.end());
if (res.second) {
_pending_compilation = true;
}
return id;
auto kernel_ids = add_kernels_source({kernel_string}, dump_custom_program);
return kernel_ids[0];
}
static std::vector<unsigned char> getProgramBinaries(cl::Program program) {
@ -384,4 +374,54 @@ void kernels_cache::reset() {
_pending_compilation = false;
}
std::vector<kernel_id> kernels_cache::add_kernels_source(std::vector<std::shared_ptr<kernel_string>> kernel_sources, bool dump_custom_program) {
std::vector<kernel_id> kernel_ids;
kernel_ids.reserve(kernel_sources.size());
for (size_t i = 0; i < kernel_sources.size(); ++i) {
std::lock_guard<std::mutex> lock(_mutex);
auto kernel_string = kernel_sources[i];
// we need unique id in order to avoid conflict across topologies.
const auto kernel_num = _kernels.size() + (_kernel_idx++);
kernel_id id = kernel_string->entry_point + "_" + std::to_string(kernel_num);
auto res = _kernels_code.emplace(kernel_string, id, dump_custom_program);
assert(_kernels.find(id) == _kernels.end());
if (res.second) {
_pending_compilation = true;
}
kernel_ids.emplace_back(id);
}
return kernel_ids;
}
// Compiles all pending kernel sources (registered via add_kernels_source) in
// batches, then clears the pending source list. Used at runtime by the
// dynamic-shape flow (primitive_inst::update_impl / update_weights).
void kernels_cache::compile() {
OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildAll");
// Temporary engine used only for this build; destroyed on return.
std::unique_ptr<ocl::ocl_engine> _build_engine = nullptr;
if (_engine.type() == engine_types::ocl) {
_build_engine = std::unique_ptr<ocl::ocl_engine>(new ocl::ocl_engine(_engine.get_device(), runtime_types::ocl,
_engine.configuration(), _engine.get_task_executor()));
}
// create batches
std::vector<batch_program> batches;
get_program_source(_kernels_code, &batches);
// build batches
// NOTE(review): _build_engine stays null for non-OCL engine types, so the
// dereference below assumes batches is empty in that case — confirm.
for (size_t idx = 0; idx < batches.size(); idx++) {
build_batch(*_build_engine, batches[idx]);
}
// All sources consumed; reset pending state until new kernels are added.
_kernels_code.clear();
_pending_compilation = false;
#if defined(__unix__) && !defined(__ANDROID__)
// NOTE: In linux, without malloc_trim, an amount of the memory used by compilation is not being returned to system though they are freed.
// (It is at least 500 MB when we perform parallel compilation)
// It is observed that freeing the memory manually with malloc_trim saves significant amount of the memory.
// Also, this is not happening in Windows.
// So, added malloc_trim for linux build until we figure out a better solution.
malloc_trim(0);
#endif
}
} // namespace cldnn

View File

@ -101,6 +101,8 @@ public:
// Drops a compiled kernel from the cache by id; no-op if the id is absent.
void remove_kernel(kernel_id id) {
_kernels.erase(id);
}
std::vector<kernel_id> add_kernels_source(std::vector<std::shared_ptr<kernel_string>> kernel_sources, bool dump_custom_program = false);
void compile();
};
} // namespace cldnn

View File

@ -0,0 +1,79 @@
// Copyright (C) 2022-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
///////////////////////////////////////////////////////////////////////////////////////////////////
#include "test_utils.h"
#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/shape_of.hpp>
#include <intel_gpu/primitives/eltwise.hpp>
#include <intel_gpu/primitives/fully_connected.hpp>
#include <intel_gpu/primitives/data.hpp>
#include <vector>
#include <iostream>
using namespace cldnn;
using namespace ::tests;
// Multi-stream smoke test for dynamic shapes: several networks built from one
// shared program run concurrently on a CPU streams executor, each cycling
// through different input lengths; the shape_of output is checked per iteration.
TEST(multistream_gpu, basic) {
const int num_streams = 2;
auto config = InferenceEngine::CPUStreamsExecutor::Config();
config._streams = num_streams;
auto task_executor = std::make_shared<InferenceEngine::CPUStreamsExecutor>(config);
auto& engine = get_test_engine();
build_options bo;
// Dynamic-shape inputs require the new shape-inference path.
bo.set_option(build_option::allow_new_shape_infer(true));
auto input1_dyn_layout = layout{ ov::PartialShape::dynamic(3), data_types::f16,format::bfyx };
auto input2_dyn_layout = layout{ ov::PartialShape::dynamic(3), data_types::f16,format::bfyx };
auto weights = engine.allocate_memory({ {512, 512}, data_types::f32, format::bfyx});
topology topology;
topology.add(input_layout("input1", input1_dyn_layout));
topology.add(input_layout("input2", input2_dyn_layout));
topology.add(data("weights", weights));
topology.add(eltwise("eltwise", "input1", "input2", eltwise_mode::sum));
topology.add(fully_connected("fc", "eltwise", "weights"));
topology.add(shape_of("shape_of", "fc", 3, data_types::i32));
// One program shared by all streams; each stream gets its own network.
auto prog_ptr = program::build_program(engine, topology, bo);
std::vector<network::ptr> networks;
for (size_t i = 0; i < num_streams; i++) {
networks.push_back(network::allocate_network(engine, prog_ptr));
}
std::vector<InferenceEngine::Task> tasks;
for (size_t i = 0; i < num_streams; i++) {
// i is captured by value so each task runs against its own network.
tasks.push_back([&networks, i, &engine] {
auto net = networks[i];
std::vector<int> various_size = {32, 128, 16, 64};
for (size_t iter = 0; iter < 8; iter++) {
int len = various_size[iter % various_size.size()];
auto input1_mem = engine.allocate_memory({ ov::PartialShape{1,len,512}, data_types::f16,format::bfyx });
auto input2_mem = engine.allocate_memory({ ov::PartialShape{1,len,512}, data_types::f16,format::bfyx });
net->set_input_data("input1", input1_mem);
net->set_input_data("input2", input2_mem);
auto outputs = net->execute();
auto output = outputs.at("shape_of").get_memory();
cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream());
// shape_of of the fc output must track the current dynamic length.
std::vector<int32_t> expected_results = {1, len, 512};
for (size_t out_idx = 0; out_idx < expected_results.size(); ++out_idx) {
EXPECT_TRUE(are_equal(expected_results[out_idx], output_ptr[out_idx]));
}
}
});
}
task_executor->runAndWait(tasks);
tasks.clear();
networks.clear();
}