[GPU] Remove duplicated OpenCL kernel compilation on static model (#16262)

* * update kernel_ids using hash value
* Change set to unordered_map for kernels_code
* replace unique_id to hash value
* Remove hash_val params
* remove redundant codes (#16262)
** Remove unique_id in program_node
** Remove gen_kernel_id
** Remove set_kernels_source
** Remove remove_kernels
** Remove kernel_idx in kernels_cache

* * Use kernel_impl_params instead of kernel_id
* Divide batch when entry_point are duplicated
* rollback removing unique_id

* * Fix get_kernel failure issue (#102467)
 - Modify hash function of custom_gpu_primitive and generic_layer
 - Add ==operation of generic_layer for _kernels map in kernels_cache
 - Fix invalid kernel_impl_params related to unique_ptr life cycle issue

* Improve kernels_cache (#102467)
* Move add_kernels_source step to build_implementations
* Change replace kernels_code key to kernel_impl_params
* Return kernel vector in get_kernels

* Modify function name to get_kernels (#102467)

* Fix functions related graph serialization (#102467)

* Fix failure to run dynamic model (#102467)

* Add unit test

* Code review follow-up
- Add const to input params
- Add missing code to check kernel duplication in kernels_cache

* Add const to input params (#102467)

* [GPU] update hash and ==operator for generic_layer and custom_gpu_primitive (#102467)

* [GPU] override get_kernels_source in generic_layer and custom_gpu_primitive (#102467)

* [GPU] Fix onednn build error (#102467)

* [GPU] Fix Lin build error (#102467)

* [GPU] kernels_cache::get_kernels return vector of clone of cldnn::kernel (#102467)

* Updated serialization logics for improved kernel caches (#16262)

* primitive key kernel cache for serialization
* kernel serialization with binaries hash
* fix kernel cache init function for deserialization
* removed unnecessary codes

* [GPU] Update comment and fix test failure (#16262)

* [GPU] Fix custom_gpu_primitive unit test failures (#16262)

* [GPU] Improved kernels cache serialization (#16262)
* removed hash in serialization logic
* update not to create a new kernels_cache for serialization
* code refactoring in serialization logic

* [GPU] Follow-up code review (#16262)

* [GPU] modify lock (#16262)

* [GPU] Fix custom_gpu_primitive unit test failure (#16262)

---------

Co-authored-by: Eddy Kim <eddy.kim@intel.com>
This commit is contained in:
Paul Youngsoo Ahn 2023-03-29 01:48:19 +09:00 committed by GitHub
parent 17c3e67336
commit 253e4eb366
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
39 changed files with 576 additions and 311 deletions

View File

@ -239,16 +239,13 @@ public:
std::shared_ptr<InferenceEngine::CPUStreamsExecutor> task_executor,
bool is_internal);
static void init_primitives();
kernel_id add_kernel(const std::shared_ptr<kernel_string>& kernel_sring);
kernel::ptr get_kernel(kernel_id id);
kernels_cache& get_kernels_cache() const;
// returns {-1, -1} if it failed to estimate by allocating given batch size
std::pair<int64_t/*const alloc*/, int64_t/*general alloc*/> get_estimated_device_mem_usage();
void remove_kernel(kernel_id id);
using ImplementationsCache = cldnn::LruCacheThreadSafe<kernel_impl_params, std::shared_ptr<primitive_impl>, kernel_impl_params::Hasher>;
ImplementationsCache& get_implementations_cache() const { return *_impls_cache; }
ICompilationContext& get_compilation_context() const { return *_compilation_context; }
void cancel_compilation_context();

View File

@ -23,8 +23,8 @@ public:
"[GPU] Failed to write " + std::to_string(size) + " bytes to stream! Wrote " + std::to_string(written_size));
}
void setKernlImplParams(void* impl_params) { _impl_params = impl_params; }
void* getKernlImplParams() const { return _impl_params; }
void setKernelImplParams(void* impl_params) { _impl_params = impl_params; }
void* getKernelImplParams() const { return _impl_params; }
private:
std::ostream& stream;
@ -42,8 +42,8 @@ public:
"[GPU] Failed to read " + std::to_string(size) + " bytes from stream! Read " + std::to_string(read_size));
}
void setKernlImplParams(void* impl_params) { _impl_params = impl_params; }
void* getKernlImplParams() const { return _impl_params; }
void setKernelImplParams(void* impl_params) { _impl_params = impl_params; }
void* getKernelImplParams() const { return _impl_params; }
void setNetwork(void* network) { _network = network; }
void* getNetwork() const { return _network; }

View File

@ -29,6 +29,10 @@ struct custom_gpu_primitive : public primitive_base<custom_gpu_primitive> {
struct arg_desc {
arg_type type;
arg_index index;
bool operator==(const arg_desc& rhs) const {
return (type == rhs.type && index == rhs.index);
}
};
/// @brief Constructs custom_gpu_primitive primitive
@ -77,7 +81,14 @@ struct custom_gpu_primitive : public primitive_base<custom_gpu_primitive> {
size_t hash() const override {
size_t seed = primitive::hash();
seed = hash_combine(seed, kernel_entry_point);
seed = hash_combine(seed, kernels_code.size());
for (auto& args : kernel_arguments) {
seed = hash_combine(seed, args.index);
seed = hash_combine(seed, args.type);
}
seed = hash_combine(seed, build_options);
seed = hash_range(seed, kernels_code.begin(), kernels_code.end());
seed = hash_range(seed, gws.begin(), gws.end());
seed = hash_range(seed, lws.begin(), lws.end());
return seed;
}
@ -87,9 +98,25 @@ struct custom_gpu_primitive : public primitive_base<custom_gpu_primitive> {
auto rhs_casted = downcast<const custom_gpu_primitive>(rhs);
return kernel_entry_point == rhs_casted.kernel_entry_point &&
build_options == rhs_casted.build_options &&
kernels_code.size() == rhs_casted.kernels_code.size();
if (kernel_entry_point != rhs_casted.kernel_entry_point)
return false;
if (build_options != rhs_casted.build_options)
return false;
if (kernel_arguments != rhs_casted.kernel_arguments)
return false;
if (kernels_code != rhs_casted.kernels_code)
return false;
if (gws != rhs_casted.gws)
return false;
if (lws != rhs_casted.lws)
return false;
return true;
}
};
} // namespace cldnn

View File

@ -531,7 +531,8 @@ struct layout {
auto pshape = get_partial_shape();
for (size_t idx = 0; idx < pshape.size(); idx++) {
seed = hash_combine(seed, pshape[idx].get_length());
auto v = pshape[idx].is_dynamic() ? -1 : pshape[idx].get_length();
seed = hash_combine(seed, v);
}
return seed;
}

View File

@ -16,11 +16,18 @@ void build_implementations::run(program& p) {
}
auto& cache = p.get_kernels_cache();
for (auto& n : p.get_processing_order()) {
if (auto impl = n->get_selected_impl()) {
auto params = n->get_kernel_impl_params();
cache.add_kernels_source(*params, impl->get_kernels_source());
}
}
cache.build_all();
for (auto& n : p.get_processing_order()) {
if (n->get_selected_impl()) {
n->get_selected_impl()->init_kernels(cache);
n->get_selected_impl()->reset_kernels_source();
if (auto impl = n->get_selected_impl()) {
auto params = n->get_kernel_impl_params();
impl->init_kernels(cache, *params);
impl->reset_kernels_source();
}
}
cache.reset();

View File

@ -65,13 +65,9 @@ void compile_graph::run(program& p) {
can_select_impl = false;
if (can_select_impl) {
tasks.push_back([node, &p, &exception] {
tasks.push_back([node, &exception] {
try {
node->selected_impl = node->type()->choose_impl(*node);
if (node->selected_impl) {
auto kernel_ids = p.get_kernels_cache().add_kernels_source(node->selected_impl->get_kernels_source());
node->selected_impl->set_kernel_ids(kernel_ids);
}
} catch(...) {
exception = std::current_exception();
}

View File

@ -70,8 +70,8 @@ void post_input_reorder::run(program& p) {
node->set_output_layout(previous_layout, false);
reorder.set_selected_impl(reorder.type()->choose_impl(reorder));
if (auto impl = reorder.get_selected_impl()) {
auto kernel_ids = p.get_kernels_cache().add_kernels_source(impl->get_kernels_source());
impl->set_kernel_ids(kernel_ids);
auto params = reorder.get_kernel_impl_params();
p.get_kernels_cache().add_kernels_source(*params, impl->get_kernels_source());
}
}
}

View File

@ -57,8 +57,8 @@ void post_optimize_weights::optimize_weights(T& node, program& p) {
if ((!g_node.is_constant()) && (!reorder.second)) {
g_node.set_selected_impl(g_node.type()->choose_impl(g_node));
if (auto impl = g_node.get_selected_impl()) {
auto kernel_ids = p.get_kernels_cache().add_kernels_source(impl->get_kernels_source());
impl->set_kernel_ids(kernel_ids);
auto params = g_node.get_kernel_impl_params();
p.get_kernels_cache().add_kernels_source(*params, impl->get_kernels_source());
}
}
}

View File

@ -36,8 +36,8 @@ void remove_redundant_reorders::run(program& p) {
node.set_unique_id();
node.set_selected_impl(node.type()->choose_impl(node));
if (auto impl = node.get_selected_impl()) {
auto kernel_ids = p.get_kernels_cache().add_kernels_source(impl->get_kernels_source());
impl->set_kernel_ids(kernel_ids);
auto params = node.get_kernel_impl_params();
p.get_kernels_cache().add_kernels_source(*params, impl->get_kernels_source());
}
};

View File

@ -54,7 +54,7 @@ struct condition_impl : typed_primitive_impl<condition> {
return make_unique<condition_impl>(arg);
}
void init_kernels(const kernels_cache&) override {}
void init_kernels(const kernels_cache& , const kernel_impl_params&) override {}
private:
primitive_id _node_id;

View File

@ -23,7 +23,7 @@ struct loop_impl : typed_primitive_impl<loop> {
return make_unique<loop_impl>(*this);
}
void init_kernels(const kernels_cache&) override {}
void init_kernels(const kernels_cache& , const kernel_impl_params&) override {}
loop_impl() : parent() {}

View File

@ -29,7 +29,7 @@ public:
return make_unique<wait_for_events_impl>(*this);
}
void init_kernels(const kernels_cache&) override {}
void init_kernels(const kernels_cache&, const kernel_impl_params&) override {}
void set_arguments(primitive_inst& /*instance*/) override {}
kernel_arguments_data get_arguments(const primitive_inst& /*instance*/) const override {
kernel_arguments_data args;

View File

@ -61,7 +61,7 @@ struct assign_impl : public typed_primitive_impl<assign> {
return ev_set_memory;
}
void init_kernels(const kernels_cache&) override {}
void init_kernels(const kernels_cache& , const kernel_impl_params&) override {}
public:
static std::unique_ptr<primitive_impl> create(const assign_node& arg, const kernel_impl_params& impl_param) {

View File

@ -845,7 +845,7 @@ public:
return ev;
}
void init_kernels(const kernels_cache&) override {}
void init_kernels(const kernels_cache& , const kernel_impl_params&) override {}
static std::unique_ptr<primitive_impl> create(const detection_output_node& arg, const kernel_impl_params&) {
return make_unique<detection_output_impl>(arg);

View File

@ -419,7 +419,7 @@ struct non_max_suppression_impl : typed_primitive_impl<non_max_suppression> {
static std::unique_ptr<primitive_impl> create(const non_max_suppression_node&, const kernel_impl_params&) {
return make_unique<non_max_suppression_impl>();
}
void init_kernels(const kernels_cache&) override {}
void init_kernels(const kernels_cache&, const kernel_impl_params&) override {}
};
namespace detail {

View File

@ -426,7 +426,7 @@ struct proposal_impl : typed_primitive_impl<proposal> {
return ev;
}
void init_kernels(const kernels_cache&) override {}
void init_kernels(const kernels_cache&, const kernel_impl_params&) override {}
static std::unique_ptr<primitive_impl> create(const proposal_node& arg, const kernel_impl_params& impl_param) {
const layout& l = impl_param.input_layouts[2];

View File

@ -62,7 +62,7 @@ struct read_value_impl : public typed_primitive_impl<read_value> {
return instance.get_network().get_stream().create_user_event(true);
}
void init_kernels(const kernels_cache&) override {}
void init_kernels(const kernels_cache& , const kernel_impl_params&) override {}
public:
static std::unique_ptr<primitive_impl> create(const read_value_node& arg, const kernel_impl_params& impl_param) {

View File

@ -28,7 +28,7 @@ struct custom_gpu_primitive_impl : typed_primitive_impl<custom_gpu_primitive> {
std::shared_ptr<kernel_selector::cl_kernel_data> cl_kernel;
std::vector<kernel::ptr> _kernels;
kernel_id _kernel_id;
std::string _cached_kernel_id;
std::unique_ptr<primitive_impl> clone() const override {
return make_unique<custom_gpu_primitive_impl>(*this);
@ -40,7 +40,7 @@ struct custom_gpu_primitive_impl : typed_primitive_impl<custom_gpu_primitive> {
custom_gpu_primitive_impl(const custom_gpu_primitive_impl& other)
: cl_kernel(other.cl_kernel)
, _kernels({})
, _kernel_id(other._kernel_id) {
, _cached_kernel_id(other._cached_kernel_id) {
for (const auto& kernel : other._kernels) {
_kernels.emplace_back(kernel->clone());
}
@ -49,12 +49,27 @@ struct custom_gpu_primitive_impl : typed_primitive_impl<custom_gpu_primitive> {
custom_gpu_primitive_impl(const custom_gpu_primitive_node& arg,
std::shared_ptr<kernel_selector::cl_kernel_data>& cl_kernel)
: cl_kernel(cl_kernel)
, _kernels() {
_kernel_id = arg.get_program().add_kernel(cl_kernel->code.kernelString);
, _kernels()
, _cached_kernel_id() { }
std::vector<std::shared_ptr<cldnn::kernel_string>> get_kernels_source() override {
std::vector<std::shared_ptr<cldnn::kernel_string>> kernel_strings;
kernel_strings.push_back(cl_kernel->code.kernelString);
return kernel_strings;
}
void init_kernels(const kernels_cache& kernels_cache) override {
_kernels.emplace_back(kernels_cache.get_kernel(_kernel_id));
void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) override {
_kernels.clear();
auto compiled_kernels = kernels_cache.get_kernels(params);
_kernels.insert(_kernels.begin(), compiled_kernels.begin(), compiled_kernels.end());
}
void init_by_cached_kernels(const kernels_cache& kernels_cache) override {
_kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(_cached_kernel_id));
}
void set_cached_kernel_ids(const kernels_cache& kernels_cache) override {
_cached_kernel_id = kernels_cache.get_cached_kernel_id(_kernels[0]);
}
void set_arguments_impl(custom_gpu_primitive_inst& instance) override {
@ -78,23 +93,19 @@ struct custom_gpu_primitive_impl : typed_primitive_impl<custom_gpu_primitive> {
return stream.enqueue_kernel(*_kernels.front(), cl_kernel.get()->params, args, events, instance.is_output());
}
std::vector<std::string> get_kernel_ids() const override {
return {_kernel_id};
}
std::vector<kernel::ptr> get_kernels() const override {
return _kernels;
}
void save(BinaryOutputBuffer& ob) const override {
ob << *cl_kernel;
ob << _kernel_id;
ob << _cached_kernel_id;
}
void load(BinaryInputBuffer& ib) override {
cl_kernel = std::make_shared<kernel_selector::cl_kernel_data>();
ib >> *cl_kernel;
ib >> _kernel_id;
ib >> _cached_kernel_id;
}
};

View File

@ -15,7 +15,7 @@ struct generic_layer_impl : typed_primitive_impl<generic_layer> {
kernel_selector::cl_kernel_data _cl_kernel_data;
std::vector<kernel::ptr> _kernels;
kernel_id _kernel_id;
std::string _cached_kernel_id;
DECLARE_OBJECT_TYPE_SERIALIZATION
@ -28,7 +28,7 @@ struct generic_layer_impl : typed_primitive_impl<generic_layer> {
generic_layer_impl(const generic_layer_impl& other)
: _cl_kernel_data(other._cl_kernel_data)
, _kernels({})
, _kernel_id(other._kernel_id) {
, _cached_kernel_id(other._cached_kernel_id) {
if (other._kernels.empty()) {
throw std::runtime_error("Can't copy generic_layer_impl node: kernels vector is empty");
}
@ -37,22 +37,41 @@ struct generic_layer_impl : typed_primitive_impl<generic_layer> {
generic_layer_impl(const generic_layer_node& arg)
: _cl_kernel_data(*arg.get_primitive()->generic_params.clKernel.get())
, _kernels() {
_kernel_id = arg.get_program().add_kernel(arg.get_primitive()->generic_params.clKernel->code.kernelString);
, _kernels()
, _cached_kernel_id() { }
std::vector<std::shared_ptr<cldnn::kernel_string>> get_kernels_source() override {
std::vector<std::shared_ptr<cldnn::kernel_string>> kernel_strings;
kernel_strings.push_back(_cl_kernel_data.code.kernelString);
return kernel_strings;
}
std::vector<kernel::ptr> get_kernels() const override {
return _kernels;
}
void save(BinaryOutputBuffer& ob) const override {
ob <<_cl_kernel_data;
ob << _kernel_id;
ob << _cached_kernel_id;
}
void load(BinaryInputBuffer& ib) override {
ib >> _cl_kernel_data;
ib >> _kernel_id;
ib >> _cached_kernel_id;
}
void init_kernels(const kernels_cache& kernels_cache) override {
_kernels.push_back(kernels_cache.get_kernel(_kernel_id));
void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) override {
_kernels.clear();
auto compiled_kernels = kernels_cache.get_kernels(params);
_kernels.insert(_kernels.begin(), compiled_kernels.begin(), compiled_kernels.end());
}
void init_by_cached_kernels(const kernels_cache& kernels_cache) override {
_kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(_cached_kernel_id));
}
void set_cached_kernel_ids(const kernels_cache& kernels_cache) override {
_cached_kernel_id = kernels_cache.get_cached_kernel_id(_kernels[0]);
}
void set_arguments_impl(generic_layer_inst& instance) override {
@ -114,7 +133,7 @@ struct generic_layer_cpu : typed_primitive_impl<generic_layer> {
return ev;
}
void init_kernels(const kernels_cache&) override {}
void init_kernels(const kernels_cache&, const kernel_impl_params&) override {}
};
static std::unique_ptr<primitive_impl> create(const generic_layer_node& arg, const kernel_impl_params&) {

View File

@ -101,12 +101,13 @@ bool query_local_block_io_supported(engine& e, const ExecutionConfig& config) {
kernel_string->batch_compilation = true;
try {
kernel_impl_params dummy_params;
auto _kernels_cache_device_query = std::unique_ptr<kernels_cache>(new kernels_cache(e, config, 0));
auto id = _kernels_cache_device_query->set_kernel_source(kernel_string, false);
_kernels_cache_device_query->add_kernels_source(dummy_params, {kernel_string}, false);
_kernels_cache_device_query->build_all();
auto kernel = _kernels_cache_device_query->get_kernel(id);
cache[device] = _kernels_cache_device_query->validate_simple_kernel_execution(kernel);
auto _kernels = _kernels_cache_device_query->get_kernels(dummy_params);
cache[device] = _kernels_cache_device_query->validate_simple_kernel_execution(_kernels[0]);
} catch (std::exception& /*ex*/) {
cache[device] = false;
}
@ -1202,7 +1203,7 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p
const auto& config = program->get_config();
const auto& device_info = engine.get_device_info();
params.uniqueID = std::to_string(param_info.unique_id);
params.uniqueID = std::to_string(param_info.hash());
params.engineInfo.supports_fp16 = device_info.supports_fp16;
params.engineInfo.supports_fp64 = device_info.supports_fp64;
params.engineInfo.supports_fp16_denorms = device_info.supports_fp16_denorms;

View File

@ -32,10 +32,10 @@ For example, all gpu convolution implementations should derive from typed_primit
template <class PType>
struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
kernel_selector::kernel_data _kernel_data;
std::vector<kernel_id> _kernel_ids;
std::vector<std::string> _cached_kernel_ids;
std::vector<kernel::ptr> _kernels;
typed_primitive_impl_ocl() : _kernel_data({}), _kernel_ids({}), _kernels({}) {
typed_primitive_impl_ocl() : _kernel_data({}), _cached_kernel_ids({}), _kernels({}) {
_kernel_data.weightsReorderParams.engine = kernel_selector::generic_kernel_params::Engine::NONE;
_kernel_data.weightsReorderParams.cpuKernel = nullptr;
_kernel_data.weightsReorderParams.clKernel = nullptr;
@ -44,7 +44,7 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
typed_primitive_impl_ocl(const typed_primitive_impl_ocl<PType>& other)
: typed_primitive_impl<PType>(other._weights_reorder_params, other._kernel_name, other._is_dynamic)
, _kernel_data(other._kernel_data)
, _kernel_ids(other._kernel_ids)
, _cached_kernel_ids(other._cached_kernel_ids)
, _kernels({}) {
_kernels.reserve(other._kernels.size());
for (size_t k = 0; k < other._kernels.size(); ++k) {
@ -68,20 +68,19 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
// Cache blob format:
// [ kernel_selector::kernel_data ]
// [ kernel_id ]
// [ kernel_arguments ]
// [ kernel_ids ]
void save(BinaryOutputBuffer& ob) const override {
ob << make_data(&_kernel_data.internalBufferDataType, sizeof(kernel_selector::Datatype));
ob << _kernel_data.internalBufferSizes;
ob << _kernel_data.kernels;
ob << _kernel_ids;
ob << _cached_kernel_ids;
}
void load(BinaryInputBuffer& ib) override {
ib >> make_data(&_kernel_data.internalBufferDataType, sizeof(kernel_selector::Datatype));
ib >> _kernel_data.internalBufferSizes;
ib >> _kernel_data.kernels;
ib >> _kernel_ids;
ib >> _cached_kernel_ids;
}
template<typename ImplType>
@ -134,20 +133,32 @@ protected:
return stream.enqueue_marker(events, is_output);
}
void init_kernels(const kernels_cache& kernels_cache) override {
void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) override {
if (is_cpu()) {
return;
}
_kernels.clear();
if (!_kernel_data.kernels.empty()) {
auto compiled_kernels = kernels_cache.get_kernels(params);
_kernels.insert(_kernels.begin(), compiled_kernels.begin(), compiled_kernels.end());
}
}
void init_by_cached_kernels(const kernels_cache& kernels_cache) override {
if (is_cpu()) {
return;
}
_kernels.clear();
_kernels.reserve(_kernel_ids.size());
for (size_t k = 0; k < _kernel_ids.size(); ++k) {
_kernels.emplace_back(kernels_cache.get_kernel(_kernel_ids[k]));
_kernels.reserve(_cached_kernel_ids.size());
for (size_t k = 0; k < _cached_kernel_ids.size(); ++k) {
_kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(_cached_kernel_ids[k]));
}
}
std::vector<std::string> get_kernel_ids() const override {
return _kernel_ids;
void set_cached_kernel_ids(const kernels_cache& kernels_cache) override {
_cached_kernel_ids = kernels_cache.get_cached_kernel_ids(_kernels);
}
std::vector<kernel::ptr> get_kernels() const override {
@ -258,10 +269,6 @@ protected:
return aggregate_events(all_events, stream, group_events);
}
void set_kernel_ids(std::vector<kernel_id> kernel_ids) override {
_kernel_ids = kernel_ids;
}
std::vector<std::shared_ptr<cldnn::kernel_string>> get_kernels_source() override {
std::vector<std::shared_ptr<cldnn::kernel_string>> kernel_strings;
for (size_t i = 0; i < _kernel_data.kernels.size(); ++i) {
@ -283,18 +290,26 @@ protected:
}
}
void set_kernels(std::map<const std::string, kernel::ptr>& kernels) override {
void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) override {
if (is_cpu())
return;
_kernel_ids.clear();
size_t total_kernels_num = std::accumulate(kernels.begin(), kernels.end(), 0,
[](size_t val, cldnn::kernels_cache::compiled_kernels::value_type& p) {
return (val + p.second.size());
});
_kernels.clear();
_kernels.reserve(kernels.size());
_kernels.reserve(total_kernels_num);
for (auto& k : kernels) {
_kernel_ids.push_back(k.first);
_kernels.emplace_back(std::move(k.second));
_kernels.insert(_kernels.end(), k.second.begin(), k.second.end());
}
}
std::vector<kernel::ptr> get_kernels() override {
return _kernels;
}
};
} // namespace ocl

View File

@ -78,7 +78,7 @@ public:
parent::save(ob);
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ob.getKernlImplParams());
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ob.getKernelImplParams());
auto prim = impl_params->typed_desc<concatenation>();
ob << prim->axis;
@ -101,7 +101,7 @@ public:
int64_t prim_axis;
ib >> prim_axis;
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernlImplParams());
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernelImplParams());
auto prim_desc = get_concatenation_primitive_descriptor(*impl_params, ib.get_engine(), *_attrs, prim_axis);
_pd = *prim_desc;

View File

@ -196,7 +196,7 @@ public:
_attrs->set_zero_points_mask(DNNL_ARG_SRC, _zero_point_mask);
}
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernlImplParams());
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernelImplParams());
auto input_md = onednn::layout_to_memory_desc(impl_params->get_input_layout(0), dnnl::memory::format_tag::undef);
auto weights_md = onednn::layout_to_memory_desc(impl_params->get_input_layout(1), dnnl::memory::format_tag::any);

View File

@ -108,7 +108,7 @@ public:
#ifdef ONEDNN_PRIMITIVE_SERIALIZATION
parent::load(ib);
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernlImplParams());
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernelImplParams());
auto input_md = onednn::layout_to_memory_desc(impl_params->get_input_layout(0), dnnl::memory::format_tag::undef);
auto weights_md = onednn::layout_to_memory_desc(impl_params->get_input_layout(1), dnnl::memory::format_tag::any);

View File

@ -159,7 +159,7 @@ public:
#ifdef ONEDNN_PRIMITIVE_SERIALIZATION
parent::save(ob);
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ob.getKernlImplParams());
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ob.getKernelImplParams());
auto prim = impl_params->typed_desc<fully_connected>();
size_t input_size = prim->input_size;
bool has_bias = !prim->bias.empty();
@ -181,7 +181,7 @@ public:
ib >> input_size;
ib >> has_bias;
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernlImplParams());
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernelImplParams());
auto prim_desc = get_fully_connected_primitive_descriptor(*impl_params, ib.get_engine(), input_size, has_bias, *_attrs);
_pd = *prim_desc;

View File

@ -178,7 +178,7 @@ public:
#ifdef ONEDNN_PRIMITIVE_SERIALIZATION
parent::save(ob);
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ob.getKernlImplParams());
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ob.getKernelImplParams());
auto prim = impl_params->typed_desc<gemm>();
bool gemm_with_bias = prim->dependencies().size() == 3;

View File

@ -108,7 +108,7 @@ public:
#ifdef ONEDNN_PRIMITIVE_SERIALIZATION
parent::load(ib);
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernlImplParams());
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernelImplParams());
dnnl::algorithm alg;
ib >> make_data(&alg, sizeof(dnnl::algorithm));

View File

@ -200,7 +200,7 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
_attrs->set_fpmath_mode(_fmath_mode);
}
{
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernlImplParams());
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernelImplParams());
const std::vector<cldnn::fused_primitive_desc_onednn>& fused_desc = impl_params->fused_desc_onednn;
dnnl::post_ops _post_ops;
int post_ops_len;
@ -451,7 +451,7 @@ protected:
return args;
}
void init_kernels(const kernels_cache&) override { }
void init_kernels(const kernels_cache&, const kernel_impl_params&) override { }
event::ptr aggregate_events(const std::vector<event::ptr>& events, stream& stream, bool group = false, bool is_output = false) const {
if (events.size() == 1 && !is_output)

View File

@ -119,7 +119,7 @@ public:
#ifdef ONEDNN_PRIMITIVE_SERIALIZATION
parent::load(ib);
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernlImplParams());
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernelImplParams());
dnnl::algorithm alg;
ib >> make_data(&alg, sizeof(dnnl::algorithm));

View File

@ -77,7 +77,7 @@ public:
#ifdef ONEDNN_PRIMITIVE_SERIALIZATION
parent::load(ib);
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernlImplParams());
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernelImplParams());
auto input_md = onednn::layout_to_memory_desc(impl_params->get_input_layout(0));
auto output_md = onednn::layout_to_memory_desc(impl_params->get_output_layout());

View File

@ -38,10 +38,102 @@ struct generic_layer : public primitive_base<generic_layer> {
size_t hash() const override {
size_t seed = primitive::hash();
seed = hash_combine(seed, id);
seed = hash_combine(seed, generic_params.engine);
if (generic_params.cpuKernel != nullptr) {
auto& cpuKernel = generic_params.cpuKernel;
seed = hash_combine(seed, cpuKernel->GetExpectedInputLayout());
seed = hash_combine(seed, cpuKernel->GetExpectedInputType());
}
if (generic_params.clKernel != nullptr) {
auto& clKernel = generic_params.clKernel;
seed = hash_combine(seed, clKernel->skip_execution);
auto& gws = clKernel->params.workGroups.global;
seed = hash_range(seed, gws.begin(), gws.end());
auto& lws = clKernel->params.workGroups.local;
seed = hash_range(seed, lws.begin(), lws.end());
auto& arguments = clKernel->params.arguments;
for (auto& args : arguments) {
seed = hash_combine(seed, args.index);
seed = hash_combine(seed, args.t);
}
auto& scalars = clKernel->params.scalars;
for (auto& s : scalars) {
seed = hash_combine(seed, s.t);
}
seed = hash_combine(seed, clKernel->code.kernelString->get_hash());
}
return seed;
}
bool operator==(const primitive& rhs) const override {
if (!compare_common_params(rhs))
return false;
auto rhs_casted = downcast<const generic_layer>(rhs);
if (generic_params.engine != rhs_casted.generic_params.engine)
return false;
if (generic_params.cpuKernel != nullptr) {
if (generic_params.cpuKernel->GetExpectedInputLayout() != rhs_casted.generic_params.cpuKernel->GetExpectedInputLayout())
return false;
if (generic_params.cpuKernel->GetExpectedInputType() != rhs_casted.generic_params.cpuKernel->GetExpectedInputType())
return false;
}
if (generic_params.clKernel != nullptr) {
auto& clKernel = generic_params.clKernel;
auto& clKernel_rhs = rhs_casted.generic_params.clKernel;
if (clKernel->skip_execution != clKernel_rhs->skip_execution)
return false;
auto& gws = clKernel->params.workGroups.global;
auto& gws_rhs = clKernel_rhs->params.workGroups.global;
if (gws != gws_rhs)
return false;
auto& lws = clKernel->params.workGroups.local;
auto& lws_rhs = clKernel_rhs->params.workGroups.local;
if (lws != lws_rhs)
return false;
auto& arguments = clKernel->params.arguments;
auto& arguments_rhs = clKernel_rhs->params.arguments;
if (arguments.size() != arguments_rhs.size())
return false;
for (size_t idx = 0; idx < arguments.size(); idx++) {
if (arguments[idx].index != arguments_rhs[idx].index)
return false;
if (arguments[idx].t != arguments_rhs[idx].t)
return false;
}
auto& scalars = clKernel->params.scalars;
auto& scalars_rhs = clKernel_rhs->params.scalars;
if (scalars.size() != scalars_rhs.size())
return false;
for (size_t idx = 0; idx < scalars.size(); idx++) {
if (scalars[idx].t != scalars_rhs[idx].t)
return false;
}
if (clKernel->code.kernelString->get_str() != clKernel_rhs->code.kernelString->get_str())
return false;
}
return true;
}
protected:
std::vector<std::reference_wrapper<const primitive_id>> get_dependencies() const override { return {}; }
};

View File

@ -59,15 +59,13 @@ struct primitive_impl {
kernel_selector::weights_reorder_params _weights_reorder_params;
// class typed_primitive_gpu_impl override this with return false;
virtual bool is_cpu() const { return true; }
virtual void init_kernels(const kernels_cache&) = 0;
virtual void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) = 0;
virtual void init_by_cached_kernels(const kernels_cache&) {}
virtual void set_cached_kernel_ids(const kernels_cache&) {}
virtual std::unique_ptr<primitive_impl> clone() const = 0;
virtual std::vector<std::string> get_kernel_ids() const {
return {};
}
virtual std::vector<std::shared_ptr<cldnn::kernel_string>> get_kernels_source() { return {}; }
virtual void reset_kernels_source() {}
virtual std::vector<kernel::ptr> get_kernels() const { return {}; }
virtual void set_kernel_ids(std::vector<kernel_id> kernel_ids) {}
virtual void save(cldnn::BinaryOutputBuffer& ob) const {}
virtual void load(cldnn::BinaryInputBuffer& ib) {}
@ -88,7 +86,8 @@ struct primitive_impl {
return primitive_impl::static_canonicalize_shapes(impl_params);
}
virtual void set_kernels(std::map<const std::string, kernel::ptr>& kernels) {}
virtual void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) {}
virtual std::vector<kernel::ptr> get_kernels() { return {}; }
protected:
std::string _kernel_name;
@ -163,8 +162,13 @@ public:
event::ptr execute(const std::vector<event::ptr>& events);
void init_kernels(const kernels_cache& kernels_cache) {
_impl->init_kernels(kernels_cache);
_impl->init_kernels(kernels_cache, *_impl_params);
}
// Thin wrapper: delegates restoring this instance's kernels from the cache's
// deserialized (cached) kernels store to the selected implementation.
void init_by_cached_kernels(const kernels_cache& kernels_cache) {
_impl->init_by_cached_kernels(kernels_cache);
}
void set_arguments();
void validate() const {

View File

@ -15,7 +15,9 @@
namespace cldnn {
size_t kernel_impl_params::hash() const {
size_t seed = desc->hash();
size_t seed = 0;
if (desc != nullptr)
seed = desc->hash();
const size_t prime_number = 2654435761; // magic number to reduce hash collision rate.
for (auto& in : input_layouts) {
seed = hash_combine(seed, in.hash() * prime_number);
@ -32,7 +34,10 @@ size_t kernel_impl_params::hash() const {
}
bool kernel_impl_params::operator==(const kernel_impl_params& rhs) const {
if (*desc != *rhs.desc)
if ((desc != nullptr && rhs.desc == nullptr) || (desc == nullptr && rhs.desc != nullptr))
return false;
if ((desc != nullptr && rhs.desc != nullptr) && *desc != *rhs.desc)
return false;
if (rhs.input_layouts.size() != input_layouts.size())

View File

@ -405,7 +405,7 @@ network::network(cldnn::BinaryInputBuffer& ib, const ExecutionConfig& config, st
ib >> *p_inst;
_primitives[p_inst->id()] = p_inst;
if (p_inst->get_impl() != nullptr)
p_inst->init_kernels(kernels_cache);
p_inst->init_by_cached_kernels(kernels_cache);
}
for (auto& item : _primitives) {
@ -515,10 +515,12 @@ network::~network() {
// [ executable primitive_inst ]
// [ memory reuse information ]
void network::save(cldnn::BinaryOutputBuffer& ob) {
kernels_cache kernels_cache(get_engine(), _config, 0, nullptr, {""});
auto& kernels_cache = _program->get_kernels_cache();
kernels_cache.reset();
for (const auto& p_inst : _exec_order) {
if (p_inst->get_impl() != nullptr)
kernels_cache.add_kernels(p_inst->get_impl()->get_kernel_ids(), p_inst->get_impl()->get_kernels());
if (p_inst->get_impl() != nullptr) {
kernels_cache.add_to_cached_kernels(p_inst->get_impl()->get_kernels());
}
}
ob << kernels_cache;
@ -597,6 +599,7 @@ void network::save(cldnn::BinaryOutputBuffer& ob) {
}
ob << get_ext_id_mapping();
kernels_cache.reset();
}
network::ptr network::allocate_network(stream::ptr stream, program::ptr program, bool is_internal, bool is_primary_stream) {

View File

@ -383,7 +383,7 @@ bool primitive_inst::update_impl() {
}
auto impl = _node->type()->choose_impl(*_node, updated_params);
auto kernels = _program->get_kernels_cache().compile(impl->get_kernels_source());
auto kernels = _program->get_kernels_cache().compile(updated_params, impl->get_kernels_source());
impl->set_kernels(kernels);
cache.add(updated_params, impl->clone());
});
@ -395,7 +395,7 @@ bool primitive_inst::update_impl() {
} else {
_impl = _node->type()->choose_impl(*_node, updated_params);
auto& kernels_cache = get_network().get_program()->get_kernels_cache();
auto kernels = kernels_cache.compile(_impl->get_kernels_source());
auto kernels = kernels_cache.compile(updated_params, _impl->get_kernels_source());
_impl->set_kernels(kernels);
cache.add(updated_params, _impl->clone());
@ -736,9 +736,9 @@ event::ptr primitive_inst::update_weights() {
GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights from " << original_layout.to_short_string()
<< " to " << expected_layout.to_short_string() << std::endl;
auto& kernels_cache = get_network().get_program()->get_kernels_cache();
auto kernels = kernels_cache.compile({weights_params.clKernel->code.kernelString});
auto kernels = kernels_cache.compile(*_impl_params, {weights_params.clKernel->code.kernelString});
OPENVINO_ASSERT(kernels.size() == 1, "The output of kernel compile has issue");
kernel = kernels.begin()->second;
kernel = (kernels.begin()->second)[0];
cache.add(kernel_key, kernel);
}
@ -1096,7 +1096,7 @@ static primitive_id find_dep_by_mem(const cldnn::primitive_inst* p_inst, memory&
// [ intermediate memory information ]
void primitive_inst::save(cldnn::BinaryOutputBuffer& ob) const {
_impl_params->save(ob);
ob.setKernlImplParams(_impl_params.get());
ob.setKernelImplParams(_impl_params.get());
ob << _node_output_layout;
ob << has_mutable_input();
@ -1169,6 +1169,7 @@ void primitive_inst::save(cldnn::BinaryOutputBuffer& ob) const {
if (_impl != nullptr) {
ob << true;
_impl->set_cached_kernel_ids(_network.get_program()->get_kernels_cache());
ob << _impl;
} else {
ob << false;
@ -1186,7 +1187,7 @@ int32_t primitive_inst::get_index_in_deps(memory::cptr arg) const {
void primitive_inst::load(cldnn::BinaryInputBuffer& ib) {
_impl_params->load(ib);
ib.setKernlImplParams(_impl_params.get());
ib.setKernelImplParams(_impl_params.get());
ib >> _node_output_layout;
ib >> _has_mutable_input;

View File

@ -229,14 +229,6 @@ std::shared_ptr<InferenceEngine::CPUStreamsExecutor> program::make_task_executor
return std::make_shared<InferenceEngine::CPUStreamsExecutor>(task_executor_config);
}
kernel_id program::add_kernel(const std::shared_ptr<kernel_string>& kernelSring) {
return _kernels_cache->set_kernel_source(kernelSring, false);
}
kernel::ptr program::get_kernel(kernel_id id) {
return _kernels_cache->get_kernel(id);
}
kernels_cache& program::get_kernels_cache() const {
return *_kernels_cache;
}
@ -1640,10 +1632,6 @@ std::pair<int64_t, int64_t> program::get_estimated_device_mem_usage() {
return std::make_pair(const_sum, get_engine().get_used_device_memory(allocation_type::usm_device));
}
void program::remove_kernel(kernel_id id) {
_kernels_cache->remove_kernel(id);
}
void program::cancel_compilation_context() {
if (_compilation_context != nullptr)
_compilation_context->cancel();

View File

@ -54,7 +54,6 @@ std::string reorder_options(const std::string& org_options) {
} // namespace
namespace cldnn {
std::atomic<size_t> kernels_cache::_kernel_idx{0};
std::mutex kernels_cache::_mutex;
std::string kernels_cache::get_cache_path() const {
@ -90,48 +89,54 @@ void kernels_cache::get_program_source(const kernels_code& kernels_source_code,
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "KernelsCache::BuildAll::GetProgramSource");
std::map<std::string, std::tuple<int32_t, std::vector<batch_program>>> program_buckets;
for (const auto& code : kernels_source_code) {
std::string full_code = code.kernel_strings->jit + code.kernel_strings->str + code.kernel_strings->undefs;
std::string entry_point = code.kernel_strings->entry_point;
std::string options = code.kernel_strings->options;
bool batch_compilation = code.kernel_strings->batch_compilation;
for (const auto& k : kernels_source_code) {
auto& code = k.second;
bool dump_custom_program = code.dump_custom_program;
if (batch_compilation) {
options = reorder_options(options);
for (auto kernel_string : code.kernel_strings) {
std::string full_code = kernel_string->jit + kernel_string->str + kernel_string->undefs;
std::string entry_point = kernel_string->entry_point;
std::string options = kernel_string->options;
bool batch_compilation = kernel_string->batch_compilation;
if (batch_compilation) {
options = reorder_options(options);
}
std::string key = options;
if (batch_compilation == false) {
key += " __PROGRAM__" + std::to_string(program_buckets.size());
}
if (dump_custom_program) {
key += " __DUMP_CUSTOM_PROGRAM__"; // Adding label to key so it would be separated from other programs
}
auto& bucket_id = std::get<0>(program_buckets[key]);
auto& current_bucket = std::get<1>(program_buckets[key]);
if (current_bucket.empty()) { // new bucket
const auto& batch_id = 0;
// increase bucket id if and only if new bucket comes
bucket_id = static_cast<int32_t>(program_buckets.size() - 1);
current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_header_str));
}
// Create new kernels batch when the limit is reached
// and current kernel's entry_point is duplicated in this kernels batch
if (current_bucket.back().kernels_counter >= get_max_kernels_per_batch()
|| current_bucket.back().entry_point_to_id.find(entry_point) != current_bucket.back().entry_point_to_id.end()) {
const auto& batch_id = static_cast<int32_t>(current_bucket.size());
current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_header_str));
}
auto& current_batch = current_bucket.back();
current_batch.dump_custom_program = dump_custom_program;
current_batch.entry_point_to_id.emplace(entry_point, code.params);
current_batch.source.push_back(std::move(full_code));
current_batch.kernels_counter++;
}
std::string key = options;
if (batch_compilation == false) {
key += " __PROGRAM__" + std::to_string(program_buckets.size());
}
if (dump_custom_program) {
key += " __DUMP_CUSTOM_PROGRAM__"; // Adding label to key so it would be separated from other programs
}
auto& bucket_id = std::get<0>(program_buckets[key]);
auto& current_bucket = std::get<1>(program_buckets[key]);
if (current_bucket.empty()) { // new bucket
const auto& batch_id = 0;
// increase bucket id if and only if new bucket comes
bucket_id = static_cast<int32_t>(program_buckets.size() - 1);
current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_header_str));
}
// Create new kernels batch when the limit is reached
if (current_bucket.back().kernels_counter >= get_max_kernels_per_batch()) {
const auto& batch_id = static_cast<int32_t>(current_bucket.size());
current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_header_str));
}
auto& current_batch = current_bucket.back();
current_batch.dump_custom_program = dump_custom_program;
current_batch.entry_point_to_id[entry_point] = code.id;
current_batch.source.push_back(std::move(full_code));
current_batch.kernels_counter++;
}
// Compute hash value for each batch
@ -165,13 +170,6 @@ kernels_cache::kernels_cache(engine& engine,
, _prog_id(prog_id)
, batch_header_str(std::move(batch_header_str)) { }
kernel_id kernels_cache::set_kernel_source(
const std::shared_ptr<kernel_string>& kernel_string,
bool dump_custom_program) {
auto kernel_ids = add_kernels_source({kernel_string}, dump_custom_program);
return kernel_ids[0];
}
static std::vector<unsigned char> getProgramBinaries(cl::Program program) {
// Get the size of the program binary in bytes.
std::vector<size_t> binary_sizes = program.getInfo<CL_PROGRAM_BINARY_SIZES>();
@ -189,7 +187,7 @@ static std::vector<unsigned char> getProgramBinaries(cl::Program program) {
}
// TODO: This build_batch method should be backend specific
void kernels_cache::build_batch(const engine& build_engine, const batch_program& batch, std::map<const std::string, kernel::ptr>& compiled_kernels) {
void kernels_cache::build_batch(const engine& build_engine, const batch_program& batch, compiled_kernels& compiled_kernels) {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "KernelsCache::build_batch");
auto& cl_build_engine = dynamic_cast<const ocl::ocl_engine&>(build_engine);
@ -280,13 +278,17 @@ void kernels_cache::build_batch(const engine& build_engine, const batch_program&
std::lock_guard<std::mutex> lock(_mutex);
for (auto& k : kernels) {
const auto& entry_point = k.getInfo<CL_KERNEL_FUNCTION_NAME>();
const auto& k_id = batch.entry_point_to_id.find(entry_point);
if (k_id != batch.entry_point_to_id.end()) {
const auto& iter = batch.entry_point_to_id.find(entry_point);
if (iter != batch.entry_point_to_id.end()) {
cl_kernel kern = k.get();
cl_context context = cl_build_engine.get_cl_context().get();
kernel::ptr kernel = kernels_factory::create(_engine, context, kern, entry_point);
const auto& kmap = std::make_pair(k_id->second, kernel);
compiled_kernels.insert(kmap);
auto& params = iter->second;
if (compiled_kernels.find(params) != compiled_kernels.end()) {
compiled_kernels[params].push_back(kernel);
} else {
compiled_kernels[params] = { kernel };
}
} else {
throw std::runtime_error("Could not find entry point");
}
@ -328,14 +330,28 @@ void kernels_cache::build_batch(const engine& build_engine, const batch_program&
}
}
kernel::ptr kernels_cache::get_kernel(kernel_id id) const {
if (_pending_compilation)
throw std::runtime_error("Kernel cache is not compiled, call build_all() first!");
// Looks up a deserialized kernel by its cached-kernel id and hands back an
// independent clone, so the caller never aliases the cache's own instance.
kernel::ptr kernels_cache::get_kernel_from_cached_kernels(std::string id) const {
    const auto found = _cached_kernels.find(id);
    OPENVINO_ASSERT(_cached_kernels.end() != found, "[GPU] Kernel " + id + " not found in the cached kernel cache!");
    return found->second->clone();
}
auto res = _kernels.find(id);
if (_kernels.end() == res)
throw std::runtime_error("Kernel " + id + " not found in the kernel cache!");
return res->second;
// Returns clones of all compiled kernels registered under the given impl params.
// Asserts if compilation is still pending (build_all() not called) or if no
// entry exists for these params; the node id (when a descriptor is present)
// is included in the failure message to aid debugging.
std::vector<kernel::ptr> kernels_cache::get_kernels(kernel_impl_params params) const {
    OPENVINO_ASSERT((_pending_compilation == false), "Kernel cache is not compiled, call build_all() first!");

    std::string current_node_id;
    if (params.desc) {
        current_node_id = params.desc->id;
    }

    const auto found = _kernels.find(params);
    OPENVINO_ASSERT(_kernels.end() != found, "Kernel for {" + current_node_id + "} is not found in the kernel cache!");

    // Clone every kernel so each caller owns an independent copy.
    std::vector<kernel::ptr> result;
    result.reserve(found->second.size());
    for (const auto& compiled : found->second) {
        result.push_back(compiled->clone());
    }
    return result;
}
bool kernels_cache::validate_simple_kernel_execution(kernel::ptr krl) {
@ -430,117 +446,110 @@ void kernels_cache::reset() {
_pending_compilation = false;
}
std::vector<kernel_id> kernels_cache::add_kernels_source(std::vector<std::shared_ptr<kernel_string>> kernel_sources, bool dump_custom_program) {
std::vector<kernel_id> kernel_ids;
kernel_ids.reserve(kernel_sources.size());
for (size_t i = 0; i < kernel_sources.size(); ++i) {
std::lock_guard<std::mutex> lock(_mutex);
auto kernel_string = kernel_sources[i];
kernel_id id = gen_kernel_id(kernel_string->entry_point);
auto res = _kernels_code.emplace(kernel_string, id, dump_custom_program);
void kernels_cache::add_kernels_source(const kernel_impl_params& params,
const std::vector<std::shared_ptr<kernel_string>>& kernel_sources,
bool dump_custom_program) {
std::lock_guard<std::mutex> lock(_mutex);
assert(_kernels.find(id) == _kernels.end());
if (!kernel_sources.empty() && (_kernels_code.find(params) == _kernels_code.end())) {
auto res = _kernels_code.insert({params, {kernel_sources, params, dump_custom_program}});
assert(_kernels.find(params) == _kernels.end());
if (res.second) {
_pending_compilation = true;
}
kernel_ids.emplace_back(id);
}
}
// Builds the serialization id of a compiled kernel as "<entry_point>@<binary_id>",
// where binary_id was assigned when the kernel's program binaries were registered
// in _cached_binaries (see add_to_cached_kernels). Asserts if the binaries were
// never registered.
std::string kernels_cache::get_cached_kernel_id(kernel::ptr kernel) const {
    const auto cl_kernel = std::static_pointer_cast<cldnn::ocl::ocl_kernel>(kernel);
    const auto& entry_point = cl_kernel->get_handle().getInfo<CL_KERNEL_FUNCTION_NAME>();
    cl::vector<unsigned char> binaries = getProgramBinaries(cl_kernel->get_handle().getInfo<CL_KERNEL_PROGRAM>());
    const auto match = _cached_binaries.find(binaries);
    OPENVINO_ASSERT(match != _cached_binaries.end(), "[GPU] Not found cached kernel binaries");
    return entry_point + "@" + std::to_string(match->second);
}
// Maps each kernel to its cached-kernel id (see get_cached_kernel_id),
// preserving the input order.
std::vector<std::string> kernels_cache::get_cached_kernel_ids(const std::vector<kernel::ptr>& kernels) const {
    std::vector<std::string> ids;
    ids.reserve(kernels.size());
    for (const auto& krnl : kernels) {
        ids.push_back(get_cached_kernel_id(krnl));
    }
    return ids;
}
void kernels_cache::add_kernels(const std::vector<std::string>& kernel_ids, const std::vector<kernel::ptr>& kernels) {
OPENVINO_ASSERT(kernel_ids.size() == kernels.size(), "[GPU] The sizes of kernel_ids and kernels are different.");
void kernels_cache::add_to_cached_kernels(const std::vector<kernel::ptr>& kernels) {
static std::atomic<uint32_t> id_gen{0};
for (size_t i = 0; i < kernel_ids.size(); i++) {
const auto& kmap = std::make_pair(kernel_ids[i], kernels[i]);
_kernels.insert(kmap);
_kernel_idx++;
for (auto& kernel : kernels) {
auto ocl_kernel = std::static_pointer_cast<cldnn::ocl::ocl_kernel>(kernel);
auto program = ocl_kernel->get_handle().getInfo<CL_KERNEL_PROGRAM>();
cl::vector<unsigned char> program_binaries = getProgramBinaries(program);
std::lock_guard<std::mutex> lock(_mutex);
auto iter = _cached_binaries.find(program_binaries);
if (iter == _cached_binaries.end()) {
_cached_binaries[program_binaries] = id_gen++;
}
auto key = get_cached_kernel_id(kernel);
if (_cached_kernels.find(key) == _cached_kernels.end()) {
_cached_kernels[key] = kernel;
}
}
}
void kernels_cache::save(BinaryOutputBuffer& ob) const {
OPENVINO_ASSERT(_engine.type() == engine_types::ocl, "[GPU] Not supported engine type");
std::map<std::string, std::string> entry_point_to_id;
for (auto iter = _kernels.begin(); iter != _kernels.end(); iter++) {
std::string k_id = iter->first;
kernel::ptr kernel = iter->second;
auto ocl_kernel = std::static_pointer_cast<cldnn::ocl::ocl_kernel>(kernel);
const auto& entry_point = ocl_kernel->get_handle().getInfo<CL_KERNEL_FUNCTION_NAME>();
entry_point_to_id[entry_point] = k_id;
ob << _cached_binaries.size();
for (auto& cached_binary : _cached_binaries) {
ob << cached_binary.second;
ob << cached_binary.first;
}
ob << entry_point_to_id;
std::unique_ptr<ocl::ocl_engine> build_engine = cldnn::make_unique<ocl::ocl_engine>(_engine.get_device(), runtime_types::ocl);
std::vector<std::vector<unsigned char>> precompiled_kernels;
for (auto iter = _kernels.begin(); iter != _kernels.end(); iter++) {
kernel::ptr kernel = iter->second;
auto ocl_kernel = std::static_pointer_cast<cldnn::ocl::ocl_kernel>(kernel);
auto program = ocl_kernel->get_handle().getInfo<CL_KERNEL_PROGRAM>();
const auto& entry_point = ocl_kernel->get_handle().getInfo<CL_KERNEL_FUNCTION_NAME>();
const auto& k_id = entry_point_to_id.find(entry_point);
if (k_id != entry_point_to_id.end()) {
cl::Program::Binaries binary_kernels = {getProgramBinaries(program)};
try {
cl::vector<cl::Kernel> kernels;
cl::Program programs(build_engine->get_cl_context(), {build_engine->get_cl_device()}, binary_kernels);
programs.build({build_engine->get_cl_device()});
programs.createKernels(&kernels);
for (auto& k : kernels) {
const auto& entry_point = k.getInfo<CL_KERNEL_FUNCTION_NAME>();
entry_point_to_id.erase(entry_point);
}
precompiled_kernels.push_back(std::move(binary_kernels[0]));
} catch (const cl::BuildError& err) {
std::string err_log = "";
for (auto& p : err.getBuildLog()) {
err_log += p.second + '\n';
}
IE_THROW() << err_log;
}
}
}
ob << precompiled_kernels;
}
void kernels_cache::load(BinaryInputBuffer& ib) {
OPENVINO_ASSERT(_engine.type() == engine_types::ocl, "[GPU] Not supported engine type");
std::unordered_map<uint32_t, std::vector<unsigned char>> precompiled_kernels;
size_t num_cached_binaries;
ib >> num_cached_binaries;
for (size_t i = 0; i < num_cached_binaries; ++i) {
uint32_t id;
ib >> id;
ib >> precompiled_kernels[id];
}
std::unique_ptr<ocl::ocl_engine> build_engine =
cldnn::make_unique<ocl::ocl_engine>(_engine.get_device(), runtime_types::ocl);
std::map<std::string, std::string> entry_point_to_id;
std::vector<std::vector<unsigned char>> precompiled_kernels;
ib >> entry_point_to_id;
ib >> precompiled_kernels;
try {
std::lock_guard<std::mutex> lock(_mutex);
_kernels.clear();
_cached_kernels.clear();
for (auto& binary_kernels : precompiled_kernels) {
for (auto& precompiled_kernel : precompiled_kernels) {
cl::vector<cl::Kernel> kernels;
cl::Program program(build_engine->get_cl_context(), {build_engine->get_cl_device()}, {binary_kernels});
cl::Program program(build_engine->get_cl_context(), {build_engine->get_cl_device()}, {precompiled_kernel.second});
program.build({build_engine->get_cl_device()});
program.createKernels(&kernels);
for (auto& k : kernels) {
const auto& entry_point = k.getInfo<CL_KERNEL_FUNCTION_NAME>();
const auto& k_id = entry_point_to_id.find(entry_point);
if (k_id != entry_point_to_id.end()) {
std::string cached_kernel_id = entry_point + "@" + std::to_string(precompiled_kernel.first);
const auto& iter = _cached_kernels.find(cached_kernel_id);
if (iter == _cached_kernels.end()) {
cl_kernel cl_kernel = k.get();
cl_context cl_context = build_engine->get_cl_context().get();
kernel::ptr kernel = kernels_factory::create(_engine, cl_context, cl_kernel, entry_point);
_kernels.insert({k_id->second, kernel});
_kernel_idx++;
_cached_kernels[cached_kernel_id] = kernel;
}
}
}
@ -553,16 +562,15 @@ void kernels_cache::load(BinaryInputBuffer& ib) {
}
}
std::map<const std::string, kernel::ptr> kernels_cache::compile(std::vector<std::shared_ptr<kernel_string>> kernel_sources,
bool dump_custom_program) {
kernels_cache::compiled_kernels kernels_cache::compile(const kernel_impl_params& params,
const std::vector<std::shared_ptr<kernel_string>>& kernel_sources,
bool dump_custom_program) {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "KernelsCache::Compile_ThreadSafe");
kernels_code t_kernels_code;
// Get kernels code from kernel sources
for (size_t idx = 0; idx < kernel_sources.size(); ++idx) {
auto kernel_string = kernel_sources[idx];
kernel_id id = gen_kernel_id(kernel_string->entry_point);
t_kernels_code.emplace(kernel_string, id, dump_custom_program);
for (size_t k = 0; k < kernel_sources.size(); ++k) {
t_kernels_code.insert({params, {kernel_sources, params, dump_custom_program}});
}
ocl::ocl_engine& _build_engine = downcast<ocl::ocl_engine>(_engine);
@ -571,7 +579,7 @@ std::map<const std::string, kernel::ptr> kernels_cache::compile(std::vector<std:
std::vector<batch_program> batches;
get_program_source(t_kernels_code, &batches);
std::map<const std::string, kernel::ptr> output_kernels;
compiled_kernels output_kernels;
// Build batches
for (size_t idx = 0; idx < batches.size(); ++idx) {
build_batch(_build_engine, batches[idx], output_kernels);

View File

@ -8,6 +8,7 @@
#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/runtime/kernel.hpp"
#include "intel_gpu/runtime/execution_config.hpp"
#include "intel_gpu/graph/kernel_impl_params.hpp"
#include <map>
#include <mutex>
@ -25,6 +26,27 @@ namespace cldnn {
class kernels_cache {
public:
// One pending compilation unit: the kernel sources produced for a single
// kernel_impl_params key, queued in kernels_code until build_all()/compile().
struct kernel_code {
// Source strings (jit + body + undefs, entry point, options) for each kernel of this impl.
std::vector<std::shared_ptr<kernel_string>> kernel_strings;
// Impl params this code was generated for; also the lookup key into the compiled-kernels map.
kernel_impl_params params;
// When true, the sources are routed to a separate, dumpable program bucket.
bool dump_custom_program;
kernel_code(const std::vector<std::shared_ptr<kernel_string>>& _kernel_strings,
const kernel_impl_params& _params,
bool _dump_custom_program)
: kernel_strings(_kernel_strings),
params(_params),
dump_custom_program(_dump_custom_program) {}
};
// Hash functor that lets kernel_impl_params key the unordered_map containers
// below (kernels_code, compiled_kernels); forwards to kernel_impl_params::hash().
struct impl_hasher {
size_t operator()(const kernel_impl_params &k) const {
return k.hash();
}
};
using kernels_code = std::unordered_map<kernel_impl_params, kernel_code, impl_hasher>;
using source_code = std::vector<std::string>;
struct batch_program {
int32_t bucket_id;
@ -34,7 +56,7 @@ public:
source_code source;
std::string options;
bool dump_custom_program;
std::map<std::string, std::string> entry_point_to_id;
std::map<std::string, kernel_impl_params> entry_point_to_id;
explicit batch_program(int32_t _bucket_id, int32_t _batch_id, std::string _options, const std::vector<std::string>& batch_header_str)
: bucket_id(_bucket_id),
@ -48,32 +70,7 @@ public:
}
};
struct kernel_code {
std::shared_ptr<kernel_string> kernel_strings;
std::string id;
bool dump_custom_program;
size_t hash_value;
kernel_code(const std::shared_ptr<kernel_string>& _kernel_strings,
const std::string& _id,
bool _dump_custom_program)
: kernel_strings(_kernel_strings),
id(_id),
dump_custom_program(_dump_custom_program),
hash_value(_kernel_strings->get_hash()) {}
bool operator == (const kernel_code& rhs) const {
return (hash_value == rhs.hash_value);
}
};
struct cmp_kernel_code {
bool operator()(const kernel_code& x1, const kernel_code& x2) const {
return (x1.hash_value < x2.hash_value);
}
};
using kernels_code = std::set<kernel_code, cmp_kernel_code>;
using compiled_kernels = std::unordered_map<kernel_impl_params, std::vector<kernel::ptr>, impl_hasher>;
private:
static std::mutex _mutex;
@ -82,32 +79,27 @@ private:
ExecutionConfig _config;
uint32_t _prog_id = 0;
kernels_code _kernels_code;
static std::atomic<size_t> _kernel_idx;
std::atomic<bool> _pending_compilation{false};
std::map<const std::string, kernel::ptr> _kernels;
compiled_kernels _kernels;
std::map<std::vector<unsigned char>, uint32_t> _cached_binaries;
std::unordered_map<std::string, kernel::ptr> _cached_kernels;
std::vector<std::string> batch_header_str;
void get_program_source(const kernels_code& kernels_source_code, std::vector<batch_program>*) const;
void build_batch(const engine& build_engine, const batch_program& batch, std::map<const std::string, kernel::ptr>& compiled_kernels);
void build_batch(const engine& build_engine, const batch_program& batch, compiled_kernels& compiled_kernels);
std::string get_cache_path() const;
bool is_cache_enabled() const;
size_t get_max_kernels_per_batch() const;
inline std::string gen_kernel_id(std::string entry_point) {
// we need unique id in order to avoid conflict across topologies.
return entry_point + "_" + std::to_string((_kernel_idx++));
}
public:
explicit kernels_cache(engine& engine,
const ExecutionConfig& config,
uint32_t prog_id,
InferenceEngine::CPUStreamsExecutor::Ptr task_executor = nullptr,
const std::vector<std::string>& batch_header_str = {});
kernel_id set_kernel_source(const std::shared_ptr<kernel_string>& kernel_string,
bool dump_custom_program);
kernel::ptr get_kernel(kernel_id id) const;
kernel::ptr get_kernel_from_cached_kernels(std::string id) const;
std::vector<kernel::ptr> get_kernels(kernel_impl_params params) const;
void set_batch_header_str(const std::vector<std::string> &batch_headers) {
batch_header_str = std::move(batch_headers);
}
@ -117,14 +109,20 @@ public:
// forces compilation of all pending kernels/programs
void build_all();
void reset();
void remove_kernel(kernel_id id) {
_kernels.erase(id);
}
std::vector<kernel_id> add_kernels_source(std::vector<std::shared_ptr<kernel_string>> kernel_sources, bool dump_custom_program = false);
void add_kernels(const std::vector<std::string>& kernel_ids, const std::vector<kernel::ptr>& kernels);
void add_kernels_source(const kernel_impl_params& params,
const std::vector<std::shared_ptr<kernel_string>>& kernel_sources,
bool dump_custom_program = false);
compiled_kernels compile(const kernel_impl_params& params,
const std::vector<std::shared_ptr<kernel_string>>& kernel_sources,
bool dump_custom_program = false);
std::string get_cached_kernel_id(kernel::ptr kernel) const;
std::vector<std::string> get_cached_kernel_ids(const std::vector<kernel::ptr>& kernels) const;
void add_to_cached_kernels(const std::vector<kernel::ptr>& kernels);
void save(BinaryOutputBuffer& ob) const;
void load(BinaryInputBuffer& ib);
std::map<const std::string, kernel::ptr> compile(std::vector<std::shared_ptr<kernel_string>> kernel_sources, bool dump_custom_program = false);
};
} // namespace cldnn

View File

@ -0,0 +1,92 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "test_utils.h"
#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/graph/program.hpp"
#include "data_inst.h"
#include "eltwise_inst.h"
#include "reshape_inst.h"
#include "shape_of_inst.h"
#include "fully_connected_inst.h"
#include "permute_inst.h"
#include "reduce_inst.h"
#include "intel_gpu/graph/network.hpp"
#include "pass_manager.h"
#include "to_string_utils.h"
#include "program_wrapper.h"
#include <memory>
using namespace cldnn;
using namespace ::tests;
// Verifies that structurally identical primitives in a static model map to the
// same cached kernel id, i.e. the compiled OpenCL kernel is reused rather than
// duplicated: conv1/conv2 and concat1/concat2 below are built with identical
// shapes/types and are expected to resolve to identical cached-kernel ids.
TEST(kernels_cache, reuse_kernel_for_static_model_01) {
auto& engine = get_test_engine();
// Two parallel branches with identical input/weight shapes so both branches
// produce the same kernel sources.
auto input0 = engine.allocate_memory({{1, 1, 4, 5}, data_types::f16, format::bfyx});
auto input1 = engine.allocate_memory({{1, 1, 4, 5}, data_types::f16, format::bfyx});
auto input2 = engine.allocate_memory({{1, 1, 4, 5}, data_types::f16, format::bfyx});
auto input3 = engine.allocate_memory({{1, 1, 4, 5}, data_types::f16, format::bfyx});
auto input4 = engine.allocate_memory({{1, 1, 4, 5}, data_types::f16, format::bfyx});
auto input5 = engine.allocate_memory({{1, 1, 4, 5}, data_types::f16, format::bfyx});
auto weights1 = engine.allocate_memory({{1, 3, 2, 3 }, data_types::f16, format::bfyx});
auto weights2 = engine.allocate_memory({{1, 3, 2, 3 }, data_types::f16, format::bfyx});
// Branch 1: concat1 -> conv1; Branch 2: concat2 -> conv2; joined by eltwise sum.
topology topology(input_layout("input0", input0->get_layout()),
input_layout("input1", input1->get_layout()),
input_layout("input2", input2->get_layout()),
input_layout("input3", input3->get_layout()),
input_layout("input4", input4->get_layout()),
input_layout("input5", input5->get_layout()),
data("weights1", weights1),
data("weights2", weights2),
concatenation("concat1",
{ input_info("input0"), input_info("input1"), input_info("input2") },
1,
data_types::f16,
padding{{0, 0, 0, 0}, 0}),
convolution("conv1", input_info("concat1"), { "weights1" }, { 1, 1 }),
concatenation("concat2",
{ input_info("input3"), input_info("input4"), input_info("input5") },
1,
data_types::f16,
padding{{0, 0, 0, 0}, 0}),
convolution("conv2", input_info("concat2"), { "weights2" }, { 1, 1 }),
eltwise("sum", {input_info("concat1"), input_info("concat2")}, eltwise_mode::sum),
reorder("output", input_info("sum"), {{3, 2}, data_types::f16, format::bfyx}));
ExecutionConfig config;
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
auto prog = program::build_program(engine, topology, config, false, false);
auto& cache = prog->get_kernels_cache();
auto& conv1_node = prog->get_node("conv1");
auto& conv2_node = prog->get_node("conv2");
// Register both convolutions' kernels, then compare their cached-kernel ids:
// identical ids mean the two nodes share one compiled kernel binary.
auto conv1_kernels = conv1_node.get_selected_impl()->get_kernels();
cache.add_to_cached_kernels(conv1_kernels);
auto conv2_kernels = conv2_node.get_selected_impl()->get_kernels();
cache.add_to_cached_kernels(conv2_kernels);
ASSERT_EQ(conv1_kernels.size(), conv2_kernels.size());
for (size_t idx = 0; idx < conv1_kernels.size(); idx++) {
auto conv1_kern = cache.get_cached_kernel_id(conv1_kernels[idx]);
auto conv2_kern = cache.get_cached_kernel_id(conv2_kernels[idx]);
ASSERT_EQ(conv1_kern, conv2_kern);
}
// Same check for the two identical concatenation nodes.
auto& concat1_node = prog->get_node("concat1");
auto& concat2_node = prog->get_node("concat2");
auto concat1_kernels = concat1_node.get_selected_impl()->get_kernels();
cache.add_to_cached_kernels(concat1_kernels);
auto concat2_kernels = concat2_node.get_selected_impl()->get_kernels();
cache.add_to_cached_kernels(concat2_kernels);
ASSERT_EQ(concat1_kernels.size(), concat2_kernels.size());
for (size_t idx = 0; idx < concat1_kernels.size(); idx++) {
auto concat1_kern = cache.get_cached_kernel_id(concat1_kernels[idx]);
auto concat2_kern = cache.get_cached_kernel_id(concat2_kernels[idx]);
ASSERT_EQ(concat1_kern, concat2_kern);
}
}