[IE CLDNN] Optimize kernel cache memory usage in GPU plugin (#1233)

This commit is contained in:
Mikhail Letavin 2020-07-13 18:33:32 +03:00 committed by GitHub
parent 543559f58c
commit 91ec946865
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 36 additions and 37 deletions

View File

@@ -105,12 +105,12 @@ kernels_cache::sorted_code kernels_cache::get_program_source(const kernels_code&
sorted_code scode;
for (const auto& code : kernels_source_code) {
const source_code org_source_code = {code.second.kernel_strings->jit, code.second.kernel_strings->str};
std::string entry_point = code.second.kernel_strings->entry_point;
std::string options = code.second.kernel_strings->options;
bool batch_compilation = code.second.kernel_strings->batch_compilation;
bool dump_custom_program = code.second.dump_custom_program;
bool one_time_kernel = code.second.one_time_kernel;
const source_code org_source_code = {code.kernel_strings->jit, code.kernel_strings->str};
std::string entry_point = code.kernel_strings->entry_point;
std::string options = code.kernel_strings->options;
bool batch_compilation = code.kernel_strings->batch_compilation;
bool dump_custom_program = code.dump_custom_program;
bool one_time_kernel = code.one_time_kernel;
batch_compilation &= does_options_support_batch_compilation(options);
@@ -144,7 +144,7 @@ kernels_cache::sorted_code kernels_cache::get_program_source(const kernels_code&
current_bucket.source.push_back({});
}
current_bucket.entry_point_to_id[entry_point] = code.second.id;
current_bucket.entry_point_to_id[entry_point] = code.id;
source_code new_source_code = org_source_code;
@@ -168,26 +168,18 @@ kernels_cache::kernel_id kernels_cache::set_kernel_source(
const std::shared_ptr<kernel_selector::kernel_string>& kernel_string,
bool dump_custom_program,
bool one_time_kernel) {
kernels_cache::kernel_id id;
// same kernel_string == same kernel
const auto key = kernel_string.get()->get_hash();
std::lock_guard<std::mutex> lock(_context.get_cache_mutex());
const auto it = _kernels_code.find(key);
// we need unique id in order to avoid conflict across topologies.
const auto kernel_num = _kernels.size() + _kernels_code.size();
kernels_cache::kernel_id id = kernel_string->entry_point + "_" + std::to_string(kernel_num);
if (it == _kernels_code.end()) {
// we need unique id in order to avoid conflict across topologies.
const auto kernel_num = _kernels.size() + _kernels_code.size();
id = kernel_string->entry_point + "_" + std::to_string(kernel_num);
_kernels_code[key] = {kernel_string, id, dump_custom_program, one_time_kernel};
} else {
id = it->second.id;
}
auto res = _kernels_code.emplace( kernel_string, id, dump_custom_program, one_time_kernel );
assert(_kernels.find(id) == _kernels.end());
_pending_compilation = true;
if (res.second) {
_pending_compilation = true;
}
return id;
}
@@ -227,8 +219,6 @@ kernels_cache::kernels_map kernels_cache::build_program(const program_code& prog
try {
cl::Program program(_context.context(), sources);
program.build({_context.device()}, program_source.options.c_str());
// Store kernels for serialization process.
_context.store_binaries(program.getInfo<CL_PROGRAM_BINARIES>(), _prog_id);
if (dump_sources && dump_file.good()) {
dump_file << "\n/* Build Log:\n";
@@ -240,7 +230,6 @@ kernels_cache::kernels_map kernels_cache::build_program(const program_code& prog
cl::vector<cl::Kernel> kernels;
program.createKernels(&kernels);
for (auto& k : kernels) {
auto kernel_name = k.getInfo<CL_KERNEL_FUNCTION_NAME>();
kmap.emplace(kernel_name, kernels_cache::kernel_type(k, _context.get_device_info().supports_usm));

View File

@@ -22,16 +22,14 @@
#include <memory>
#include <atomic>
#include <string>
#include <unordered_set>
#include <kernel_selector_common.h>
namespace cl {
class Kernel;
class KernelIntel;
}
namespace kernel_selector {
struct KernelString;
}
namespace kernel_selector {
using kernel_string = kernel_selector::KernelString;
}
@@ -59,13 +57,32 @@ public:
std::string id;
bool dump_custom_program;
bool one_time_kernel;
// Constructs a cache entry for one kernel: captures the shared kernel source
// strings, the unique kernel id, and the two compilation flags verbatim.
// NOTE(review): this is a member of the kernel_code struct whose opening line
// lies outside this diff hunk — confirm against the full header.
kernel_code(const std::shared_ptr<kernel_selector::kernel_string>& _kernel_strings,
const std::string& _id,
bool _dump_custom_program,
bool _one_time_kernel)
: kernel_strings(_kernel_strings),
id(_id),
dump_custom_program(_dump_custom_program),
one_time_kernel(_one_time_kernel) {}
// Equality for the kernels_code unordered_set: two entries are the same
// kernel when their kernel strings yield the same hash key, so identical
// kernel sources are stored only once.
// NOTE(review): assumes kernel_string::get_hash() is a collision-free key
// for the full source — confirm in kernel_selector_common.h.
bool operator == (const kernel_code& c2) const {
return kernel_strings->get_hash() == c2.kernel_strings->get_hash();
};
};
// Hash functor for the kernels_code unordered_set. It hashes the same
// get_hash() key string that kernel_code::operator== compares, which keeps
// the Hash and KeyEqual requirements of std::unordered_set consistent
// (equal elements always produce equal hashes).
struct hash_kernel_code {
size_t operator()(const kernel_code& x) const {
// get_hash() evidently returns a std::string here, since it is fed to
// std::hash<std::string>.
return std::hash<std::string>()(x.kernel_strings->get_hash());
}
};
typedef std::string kernel_id;
typedef cl::KernelIntel kernel_type;
using sorted_code = std::map<std::string, program_code>;
using kernels_map = std::map<std::string, kernel_type>;
using kernels_code = std::map<std::string, kernel_code>;
using kernels_code = std::unordered_set<kernel_code, hash_kernel_code>;
private:
gpu_toolkit& _context;

View File

@@ -166,10 +166,6 @@ kernels_cache& gpu_toolkit::get_kernels_cache(uint32_t prog_id) {
return get_program_state(prog_id)._kernels_cache;
}
void gpu_toolkit::store_binaries(kernels_binaries_vector binaries, uint32_t prog_id) {
get_program_state(prog_id)._binaries.push_back(binaries);
}
void gpu_toolkit::add_network(uint32_t net_id) {
std::lock_guard<std::mutex> lock(toolkit_mutex);
command_queues_builder queue_builder(context(), device(), _device->get_platform());

View File

@@ -62,7 +62,6 @@ protected:
struct gpu_program_state {
kernels_cache _kernels_cache;
kernels_binaries_container _binaries;
gpu_program_state(gpu_toolkit& context, uint32_t prog_id) :
_kernels_cache(context, prog_id) {}
@@ -87,7 +86,6 @@ public:
device_info_internal get_device_info() const { return _device->get_info(); }
std::shared_ptr<kernel_selector::TuningCache> get_device_cache() const { return _device_cache; }
kernels_cache& get_kernels_cache(uint32_t prog_id);
void store_binaries(kernels_binaries_vector binaries, uint32_t prog_id);
bool get_serialization_flag() { return _serialize; }
void set_serialization_flag(bool serialization_flag) { _serialize = serialization_flag; }
@@ -136,7 +134,6 @@ private:
std::map<uint32_t, std::shared_ptr<gpu_program_state>> _program_states;
std::map<uint32_t, gpu_queue> _command_queues_w;
std::shared_ptr<kernel_selector::TuningCache> _device_cache;
kernels_binaries_container _binaries;
bool _serialize = false;
std::string _extensions;