- Separate kernels_cache::add_kernel from factory (choose_impl) - Reset kernels_cache._kernels after kernels_cache.compile - Create cldnn unit test case to check multi-stream processing
This commit is contained in:
parent
9943edfa34
commit
5c17c7e0a0
@ -11,6 +11,7 @@
|
||||
#include "intel_gpu/runtime/engine.hpp"
|
||||
#include "intel_gpu/runtime/event.hpp"
|
||||
#include "intel_gpu/runtime/stream.hpp"
|
||||
#include "intel_gpu/runtime/lru_cache.hpp"
|
||||
|
||||
#include <map>
|
||||
#include <vector>
|
||||
@ -216,6 +217,15 @@ public:
|
||||
/// Returns memory state @p variable_id of stateful network
|
||||
VariableState& get_variable_memory(const std::string &variable_id);
|
||||
|
||||
/// Return kernels_cache
|
||||
kernels_cache& get_kernels_cache() const { return *_kernels_cache; }
|
||||
|
||||
/// Return implementations_cache
|
||||
ImplementationsCache& get_implementations_cache() const { return *_impls_cache; }
|
||||
|
||||
/// Return in_mem_kernels_cache
|
||||
KernelsCache& get_in_mem_kernels_cache() const { return *_in_mem_kernels_cache; }
|
||||
|
||||
private:
|
||||
using output_chains_map = std::map<primitive_id, std::vector<std::shared_ptr<primitive_inst>>>;
|
||||
uint32_t net_id = 0;
|
||||
@ -249,5 +259,13 @@ private:
|
||||
void check_names();
|
||||
void add_default_output_chains();
|
||||
output_chains_map::iterator add_output_chain(std::shared_ptr<primitive_inst>& p_inst);
|
||||
|
||||
std::unique_ptr<kernels_cache> _kernels_cache;
|
||||
// Moved from cldnn::program to cldnn::network to avoid multi-threading issues.
|
||||
std::unique_ptr<ImplementationsCache> _impls_cache;
|
||||
std::unique_ptr<KernelsCache> _in_mem_kernels_cache;
|
||||
// TODO: initial version use unlimited caches. Need to adjust it once dynamic flow works on wide set of models.
|
||||
const size_t _impls_cache_capacity = 0;
|
||||
const size_t _in_mem_kernels_cache_capacity = 0;
|
||||
};
|
||||
} // namespace cldnn
|
||||
|
@ -247,8 +247,6 @@ public:
|
||||
|
||||
void load_tuning_cache();
|
||||
std::shared_ptr<kernel_selector::TuningCache> get_tuning_cache() const { return tuning_cache; }
|
||||
ImplementationsCache& get_implementations_cache() const { return *_impls_cache; }
|
||||
KernelsCache& get_in_mem_kernels_cache() const { return *_in_mem_kernels_cache; }
|
||||
|
||||
// returns {-1, -1} if it failed to estimate by allocating given batch size
|
||||
std::pair<int64_t/*const alloc*/, int64_t/*general alloc*/> get_estimated_device_mem_usage();
|
||||
@ -261,11 +259,6 @@ private:
|
||||
stream::ptr _stream;
|
||||
// TODO: Consider moving it to engine
|
||||
std::unique_ptr<kernels_cache> _kernels_cache;
|
||||
std::unique_ptr<ImplementationsCache> _impls_cache;
|
||||
std::unique_ptr<KernelsCache> _in_mem_kernels_cache;
|
||||
// TODO: initial version use unlimited caches. Need to adjust it once dynamic flow works on wide set of models.
|
||||
const size_t _impls_cache_capacity = 0;
|
||||
const size_t _in_mem_kernels_cache_capacity = 0;
|
||||
build_options options;
|
||||
std::list<program_node*> inputs;
|
||||
std::vector<program_node*> outputs;
|
||||
|
@ -34,9 +34,13 @@ void compile_graph::run(program& p) {
|
||||
for (size_t idx = 0; idx < proc_order.size(); idx++) {
|
||||
auto& node = *(std::next(proc_order.begin(), idx));
|
||||
if (!node->is_type<data>() && !(node->is_type<mutable_data>() && node->get_dependencies().empty()) && !node->is_dynamic()) {
|
||||
tasks.push_back([node, &exception] {
|
||||
tasks.push_back([node, &p, &exception] {
|
||||
try {
|
||||
node->selected_impl = node->type()->choose_impl(*node);
|
||||
if (node->selected_impl) {
|
||||
auto kernel_ids = p.get_kernels_cache().add_kernels_source(node->selected_impl->get_kernels_source());
|
||||
node->selected_impl->set_kernel_ids(kernel_ids);
|
||||
}
|
||||
} catch(...) {
|
||||
exception = std::current_exception();
|
||||
}
|
||||
|
@ -66,6 +66,10 @@ void post_input_reorder::run(program& p) {
|
||||
reorder.get_output_layout(false);
|
||||
node->set_output_layout(previous_layout, false);
|
||||
reorder.set_selected_impl(reorder.type()->choose_impl(reorder));
|
||||
if (auto impl = reorder.get_selected_impl()) {
|
||||
auto kernel_ids = p.get_kernels_cache().add_kernels_source(impl->get_kernels_source());
|
||||
impl->set_kernel_ids(kernel_ids);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -53,8 +53,13 @@ void post_optimize_weights::optimize_weights(T& node, program& p) {
|
||||
|
||||
// Don't run impl selection to avoid double compilation of reorder kernels
|
||||
// in main program and internal program for constant propagation
|
||||
if (!g_node.is_constant())
|
||||
g_node.selected_impl = g_node.type()->choose_impl(g_node);
|
||||
if (!g_node.is_constant()) {
|
||||
g_node.set_selected_impl(g_node.type()->choose_impl(g_node));
|
||||
if (auto impl = g_node.get_selected_impl()) {
|
||||
auto kernel_ids = p.get_kernels_cache().add_kernels_source(impl->get_kernels_source());
|
||||
impl->set_kernel_ids(kernel_ids);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -36,8 +36,11 @@ void remove_redundant_reorders::run(program& p) {
|
||||
return;
|
||||
|
||||
node.set_unique_id();
|
||||
auto new_impl = node.type()->choose_impl(node);
|
||||
node.set_selected_impl(std::move(new_impl));
|
||||
node.set_selected_impl(node.type()->choose_impl(node));
|
||||
if (auto impl = node.get_selected_impl()) {
|
||||
auto kernel_ids = p.get_kernels_cache().add_kernels_source(impl->get_kernels_source());
|
||||
impl->set_kernel_ids(kernel_ids);
|
||||
}
|
||||
};
|
||||
|
||||
// Fuse reorders into primitives
|
||||
|
@ -50,12 +50,6 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
|
||||
_kernel_data.weightsReorderParams.engine = kernel_selector::generic_kernel_params::Engine::NONE;
|
||||
_kernel_data.weightsReorderParams.cpuKernel = nullptr;
|
||||
_kernel_data.weightsReorderParams.clKernel = nullptr;
|
||||
|
||||
_kernel_ids.reserve(kd.kernels.size());
|
||||
// Add selected kernels to kernels_cache for the following compilation and save output ids
|
||||
for (size_t i = 0; i < kd.kernels.size(); ++i) {
|
||||
_kernel_ids.emplace_back(arg.get_program().add_kernel(kd.kernels[i].code.kernelString));
|
||||
}
|
||||
}
|
||||
|
||||
bool is_cpu() const override { return false; }
|
||||
@ -198,6 +192,18 @@ protected:
|
||||
bool group_events = (all_events.size() > 1);
|
||||
return aggregate_events(all_events, stream, group_events);
|
||||
}
|
||||
|
||||
// Stores the ids returned by kernels_cache::add_kernels_source so this impl
// can later look up its compiled kernels by id.
void set_kernel_ids(std::vector<kernel_id> kernel_ids) override {
    _kernel_ids = std::move(kernel_ids);  // sink parameter: move instead of copying the vector
}
|
||||
|
||||
// Collects this impl's kernel source strings so the caller (program/network)
// can register them with a kernels_cache for deferred compilation.
std::vector<std::shared_ptr<cldnn::kernel_string>> get_kernels_source() override {
    std::vector<std::shared_ptr<cldnn::kernel_string>> kernel_strings;
    kernel_strings.reserve(_kernel_data.kernels.size());  // one allocation, no reallocs
    for (const auto& k : _kernel_data.kernels) {
        kernel_strings.push_back(k.code.kernelString);
    }
    return kernel_strings;
}
|
||||
};
|
||||
|
||||
} // namespace ocl
|
||||
|
@ -54,6 +54,9 @@ struct primitive_impl {
|
||||
virtual std::vector<std::string> get_kernel_ids() {
|
||||
return {};
|
||||
}
|
||||
virtual std::vector<std::shared_ptr<cldnn::kernel_string>> get_kernels_source() { return {}; }
|
||||
virtual void set_kernels(std::vector<kernel::ptr>) {}
|
||||
virtual void set_kernel_ids(std::vector<kernel_id> kernel_ids) {}
|
||||
|
||||
// If this flag is set as false, the memory allocated for this primitive is not allowed to be reused
|
||||
bool can_reuse_memory = true;
|
||||
|
@ -31,6 +31,7 @@
|
||||
#include "kernel_selector_helper.h"
|
||||
#include "program_helpers.h"
|
||||
#include "runtime/cldnn_itt.hpp"
|
||||
#include "kernels_cache.hpp"
|
||||
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
@ -293,6 +294,13 @@ network::network(program::ptr program, stream::ptr stream, bool is_internal, boo
|
||||
build_exec_order();
|
||||
validate_primitives();
|
||||
add_default_output_chains();
|
||||
|
||||
if (is_dynamic()) {
|
||||
_kernels_cache = std::unique_ptr<kernels_cache>(new kernels_cache(program->get_engine(), program->get_id(),
|
||||
kernel_selector::KernelBase::get_db().get_batch_header_str()));
|
||||
_impls_cache = std::unique_ptr<ImplementationsCache>(new ImplementationsCache(_impls_cache_capacity));
|
||||
_in_mem_kernels_cache = std::unique_ptr<KernelsCache>(new KernelsCache(_in_mem_kernels_cache_capacity));
|
||||
}
|
||||
}
|
||||
|
||||
network::network(engine& engine,
|
||||
|
@ -286,17 +286,19 @@ void primitive_inst::update_impl() {
|
||||
};
|
||||
|
||||
auto layout_key = get_layout_key();
|
||||
auto& cache = _network.get_program()->get_implementations_cache();
|
||||
auto& cache = get_network().get_implementations_cache();
|
||||
if (cache.has(layout_key)) {
|
||||
_impl = cache.get(layout_key)->clone();
|
||||
GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true);
|
||||
} else {
|
||||
auto lru = cache.get_lru_element();
|
||||
_impl = _node->type()->choose_impl(*_node, *_impl_params);
|
||||
_network.get_program()->compile();
|
||||
_impl->init_kernels(_network.get_program()->get_kernels_cache());
|
||||
auto& kernels_cache = get_network().get_kernels_cache();
|
||||
auto kernel_ids = kernels_cache.add_kernels_source(_impl->get_kernels_source());
|
||||
_impl->set_kernel_ids(kernel_ids);
|
||||
kernels_cache.compile();
|
||||
_impl->init_kernels(kernels_cache);
|
||||
cache.add(layout_key, _impl->clone());
|
||||
_network.get_program()->get_kernels_cache().reset();
|
||||
kernels_cache.reset();
|
||||
}
|
||||
|
||||
reset_shape_change();
|
||||
@ -560,8 +562,6 @@ event::ptr primitive_inst::update_weights() {
|
||||
auto original_weights_memory = dep_memory_ptr(weights_idx);
|
||||
auto original_layout = original_weights_memory->get_layout();
|
||||
layout expected_layout = from_weights_tensor(weights_params.dest);
|
||||
|
||||
auto& program = _node->get_program();
|
||||
auto& engine = _network.get_engine();
|
||||
|
||||
auto get_layout_key = [&]() -> std::string {
|
||||
@ -574,7 +574,7 @@ event::ptr primitive_inst::update_weights() {
|
||||
cldnn::kernel::ptr kernel = nullptr;
|
||||
auto layout_key = get_layout_key();
|
||||
if (layout_key != "") {
|
||||
auto& cache = program.get_in_mem_kernels_cache();
|
||||
auto& cache = get_network().get_in_mem_kernels_cache();
|
||||
if (cache.has(layout_key)) {
|
||||
GPU_DEBUG_IF(debug_config->verbose >= 4) {
|
||||
GPU_DEBUG_COUT << id() << ": reorder weights (cached) from " << original_layout << "\nto " << expected_layout << std::endl;
|
||||
@ -585,14 +585,16 @@ event::ptr primitive_inst::update_weights() {
|
||||
GPU_DEBUG_IF(debug_config->verbose >= 4) {
|
||||
GPU_DEBUG_COUT << id() << ": reorder weights from " << original_layout << "\nto " << expected_layout << std::endl;
|
||||
}
|
||||
auto _kernel_id = program.add_kernel(weights_params.clKernel->code.kernelString);
|
||||
program.compile();
|
||||
kernel = program.get_kernel(_kernel_id);
|
||||
auto& kernels_cache = get_network().get_kernels_cache();
|
||||
auto kernel_id = kernels_cache.set_kernel_source(weights_params.clKernel->code.kernelString, false);
|
||||
kernels_cache.compile();
|
||||
kernel = kernels_cache.get_kernel(kernel_id);
|
||||
cache.add(layout_key, kernel);
|
||||
kernels_cache.reset();
|
||||
}
|
||||
}
|
||||
|
||||
auto& stream = _network.get_stream();
|
||||
auto& stream = get_network().get_stream();
|
||||
|
||||
bool can_reuse = _impl_params->reordered_weights != nullptr && _impl_params->reordered_weights->size() <= expected_layout.bytes_count();
|
||||
if (can_reuse) {
|
||||
|
@ -114,8 +114,6 @@ program::program(engine& engine_ref,
|
||||
prepare_nodes(topology);
|
||||
_kernels_cache = std::unique_ptr<kernels_cache>(new kernels_cache(_engine, prog_id,
|
||||
kernel_selector::KernelBase::get_db().get_batch_header_str()));
|
||||
_impls_cache = std::unique_ptr<ImplementationsCache>(new ImplementationsCache(_impls_cache_capacity));
|
||||
_in_mem_kernels_cache = std::unique_ptr<KernelsCache>(new KernelsCache(_in_mem_kernels_cache_capacity));
|
||||
program_node::reset_unique_id();
|
||||
if (no_optimizations) {
|
||||
init_graph();
|
||||
@ -137,8 +135,6 @@ program::program(engine& engine_ref,
|
||||
set_options();
|
||||
_kernels_cache = std::unique_ptr<kernels_cache>(new kernels_cache(_engine, prog_id,
|
||||
kernel_selector::KernelBase::get_db().get_batch_header_str()));
|
||||
_impls_cache = std::unique_ptr<ImplementationsCache>(new ImplementationsCache(_impls_cache_capacity));
|
||||
_in_mem_kernels_cache = std::unique_ptr<KernelsCache>(new KernelsCache(_in_mem_kernels_cache_capacity));
|
||||
pm = std::unique_ptr<pass_manager>(new pass_manager(*this));
|
||||
prepare_nodes(nodes);
|
||||
build_program(is_internal);
|
||||
|
@ -148,18 +148,8 @@ kernels_cache::kernels_cache(engine& engine, uint32_t prog_id, const std::vector
|
||||
kernel_id kernels_cache::set_kernel_source(
|
||||
const std::shared_ptr<kernel_string>& kernel_string,
|
||||
bool dump_custom_program) {
|
||||
std::lock_guard<std::mutex> lock(_mutex);
|
||||
// we need unique id in order to avoid conflict across topologies.
|
||||
const auto kernel_num = _kernels.size() + (_kernel_idx++);
|
||||
kernel_id id = kernel_string->entry_point + "_" + std::to_string(kernel_num);
|
||||
|
||||
auto res = _kernels_code.emplace(kernel_string, id, dump_custom_program);
|
||||
|
||||
assert(_kernels.find(id) == _kernels.end());
|
||||
if (res.second) {
|
||||
_pending_compilation = true;
|
||||
}
|
||||
return id;
|
||||
auto kernel_ids = add_kernels_source({kernel_string}, dump_custom_program);
|
||||
return kernel_ids[0];
|
||||
}
|
||||
|
||||
static std::vector<unsigned char> getProgramBinaries(cl::Program program) {
|
||||
@ -384,4 +374,54 @@ void kernels_cache::reset() {
|
||||
_pending_compilation = false;
|
||||
}
|
||||
|
||||
std::vector<kernel_id> kernels_cache::add_kernels_source(std::vector<std::shared_ptr<kernel_string>> kernel_sources, bool dump_custom_program) {
|
||||
std::vector<kernel_id> kernel_ids;
|
||||
kernel_ids.reserve(kernel_sources.size());
|
||||
for (size_t i = 0; i < kernel_sources.size(); ++i) {
|
||||
std::lock_guard<std::mutex> lock(_mutex);
|
||||
auto kernel_string = kernel_sources[i];
|
||||
// we need unique id in order to avoid conflict across topologies.
|
||||
const auto kernel_num = _kernels.size() + (_kernel_idx++);
|
||||
kernel_id id = kernel_string->entry_point + "_" + std::to_string(kernel_num);
|
||||
|
||||
auto res = _kernels_code.emplace(kernel_string, id, dump_custom_program);
|
||||
|
||||
assert(_kernels.find(id) == _kernels.end());
|
||||
if (res.second) {
|
||||
_pending_compilation = true;
|
||||
}
|
||||
kernel_ids.emplace_back(id);
|
||||
}
|
||||
return kernel_ids;
|
||||
}
|
||||
|
||||
void kernels_cache::compile() {
|
||||
OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildAll");
|
||||
|
||||
std::unique_ptr<ocl::ocl_engine> _build_engine = nullptr;
|
||||
if (_engine.type() == engine_types::ocl) {
|
||||
_build_engine = std::unique_ptr<ocl::ocl_engine>(new ocl::ocl_engine(_engine.get_device(), runtime_types::ocl,
|
||||
_engine.configuration(), _engine.get_task_executor()));
|
||||
}
|
||||
|
||||
// create batches
|
||||
std::vector<batch_program> batches;
|
||||
get_program_source(_kernels_code, &batches);
|
||||
|
||||
// build batches
|
||||
for (size_t idx = 0; idx < batches.size(); idx++) {
|
||||
build_batch(*_build_engine, batches[idx]);
|
||||
}
|
||||
|
||||
_kernels_code.clear();
|
||||
_pending_compilation = false;
|
||||
#if defined(__unix__) && !defined(__ANDROID__)
|
||||
// NOTE: In linux, without malloc_trim, an amount of the memory used by compilation is not being returned to system thought they are freed.
|
||||
// (It is at least 500 MB when we perform parallel compilation)
|
||||
// It is observed that freeing the memory manually with malloc_trim saves significant amount of the memory.
|
||||
// Also, this is not happening in Windows.
|
||||
// So, added malloc_trim for linux build until we figure out a better solution.
|
||||
malloc_trim(0);
|
||||
#endif
|
||||
}
|
||||
} // namespace cldnn
|
||||
|
@ -101,6 +101,8 @@ public:
|
||||
void remove_kernel(kernel_id id) {
|
||||
_kernels.erase(id);
|
||||
}
|
||||
std::vector<kernel_id> add_kernels_source(std::vector<std::shared_ptr<kernel_string>> kernel_sources, bool dump_custom_program = false);
|
||||
void compile();
|
||||
};
|
||||
|
||||
} // namespace cldnn
|
||||
|
@ -0,0 +1,79 @@
|
||||
// Copyright (C) 2022-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "test_utils.h"
|
||||
|
||||
#include <intel_gpu/primitives/input_layout.hpp>
|
||||
#include <intel_gpu/primitives/shape_of.hpp>
|
||||
#include <intel_gpu/primitives/eltwise.hpp>
|
||||
#include <intel_gpu/primitives/fully_connected.hpp>
|
||||
#include <intel_gpu/primitives/data.hpp>
|
||||
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
using namespace cldnn;
|
||||
using namespace ::tests;
|
||||
|
||||
|
||||
// Runs the same dynamic-shape program on several networks concurrently (one per
// stream) and checks that shape inference stays correct under multi-threading.
TEST(multistream_gpu, basic) {
    const int num_streams = 2;

    // Executor that drives all streams in parallel.
    auto executor_config = InferenceEngine::CPUStreamsExecutor::Config();
    executor_config._streams = num_streams;
    auto executor = std::make_shared<InferenceEngine::CPUStreamsExecutor>(executor_config);
    auto& engine = get_test_engine();

    build_options bo;
    bo.set_option(build_option::allow_new_shape_infer(true));

    // Dynamic-rank-3 inputs so every inference may pick a different sequence length.
    auto lhs_layout = layout{ ov::PartialShape::dynamic(3), data_types::f16,format::bfyx };
    auto rhs_layout = layout{ ov::PartialShape::dynamic(3), data_types::f16,format::bfyx };
    auto weights = engine.allocate_memory({ {512, 512}, data_types::f32, format::bfyx});

    topology topology;
    topology.add(input_layout("input1", lhs_layout));
    topology.add(input_layout("input2", rhs_layout));
    topology.add(data("weights", weights));
    topology.add(eltwise("eltwise", "input1", "input2", eltwise_mode::sum));
    topology.add(fully_connected("fc", "eltwise", "weights"));
    topology.add(shape_of("shape_of", "fc", 3, data_types::i32));

    auto prog_ptr = program::build_program(engine, topology, bo);

    // One network instance per stream, all sharing the same compiled program.
    std::vector<network::ptr> networks;
    for (int s = 0; s < num_streams; ++s) {
        networks.push_back(network::allocate_network(engine, prog_ptr));
    }

    std::vector<InferenceEngine::Task> tasks;
    for (int s = 0; s < num_streams; ++s) {
        tasks.push_back([&networks, s, &engine] {
            auto net = networks[s];
            const std::vector<int> shape_candidates = {32, 128, 16, 64};
            for (size_t iter = 0; iter < 8; iter++) {
                const int len = shape_candidates[iter % shape_candidates.size()];
                auto lhs_mem = engine.allocate_memory({ ov::PartialShape{1,len,512}, data_types::f16,format::bfyx });
                auto rhs_mem = engine.allocate_memory({ ov::PartialShape{1,len,512}, data_types::f16,format::bfyx });
                net->set_input_data("input1", lhs_mem);
                net->set_input_data("input2", rhs_mem);

                auto outputs = net->execute();

                auto output = outputs.at("shape_of").get_memory();
                cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream());

                // shape_of must report the actual runtime shape {1, len, 512}.
                const std::vector<int32_t> expected_results = {1, len, 512};
                for (size_t k = 0; k < expected_results.size(); ++k) {
                    EXPECT_TRUE(are_equal(expected_results[k], output_ptr[k]));
                }
            }
        });
    }

    executor->runAndWait(tasks);
    tasks.clear();
    networks.clear();
}
|
Loading…
Reference in New Issue
Block a user