[GPU] Remove duplicated OpenCL kernel compilation on static model (#16262)

* * update kernel_ids using hash value
* Change set to unordered_map for kernels_code
* replace unique_id to hash value
* Remove hash_val params
* remove redundant codes (#16262)
** Remove unique_id in program_node
** Remove gen_kernel_id
** Remove set_kernels_source
** Remove remove_kernels
** Remove kernel_idx in kernels_cache

* * Use kernel_impl_params instead of kernel_id
* Divide batch when entry_point are duplicated
* rollback removing unique_id

* * Fix get_kernel failure issue (#102467)
 - Modify hash function of custom_gpu_primitive and generic_layer
 - Add ==operation of generic_layer for _kernels map in kernels_cache
 - Fix invalid kernel_impl_params related to unique_ptr life cycle issue

* Improve kernels_cache (#102467)
* Move add_kernels_source step to build_implementations
* Change replace kernels_code key to kernel_impl_params
* Return kernel vector in get_kernels

* Modify function name to get_kernels (#102467)

* Fix functions related graph serialization (#102467)

* Fix failure to run dynamic model (#102467)

* Add unit test

* Code review follow-up
- Add const to input params
- Add missing code to check kernel duplication in kernels_cache

* Add const to input params (#102467)

* [GPU] update hash and ==operator for generic_layer and custom_gpu_primitive (#102467)

* [GPU] override get_kernels_source in generic_layer and custom_gpu_primitive (#102467)

* [GPU] Fix onednn build error (#102467)

* [GPU] Fix Lin build error (#102467)

* [GPU] kernels_cache::get_kernels return vector of clone of cldnn::kernel (#102467)

* Updated serialization logics for improved kernel caches (#16262)

* primitive key kernel cache for serialization
* kernel serialization with binaries hash
* fix kernel cache init function for deserialization
* removed unnecessary codes

* [GPU] Update comment and fix test failure (#16262)

* [GPU] Fix custom_gpu_primitive unit test failures (#16262)

* [GPU] Improved kernels cache serialization (#16262)
* removed hash in serialization logic
* update not to create a new kernels_cache for serialization
* code refactoring in serialization logic

* [GPU] Follow-up code review (#16262)

* [GPU] modify lock (#16262)

* [GPU] Fix custom_gpu_primitive unit test failure (#16262)

---------

Co-authored-by: Eddy Kim <eddy.kim@intel.com>
This commit is contained in:
Paul Youngsoo Ahn 2023-03-29 01:48:19 +09:00 committed by GitHub
parent 17c3e67336
commit 253e4eb366
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
39 changed files with 576 additions and 311 deletions

View File

@ -239,16 +239,13 @@ public:
std::shared_ptr<InferenceEngine::CPUStreamsExecutor> task_executor,
bool is_internal);
static void init_primitives();
kernel_id add_kernel(const std::shared_ptr<kernel_string>& kernel_sring);
kernel::ptr get_kernel(kernel_id id);
kernels_cache& get_kernels_cache() const;
// returns {-1, -1} if it failed to estimate by allocating given batch size
std::pair<int64_t/*const alloc*/, int64_t/*general alloc*/> get_estimated_device_mem_usage();
void remove_kernel(kernel_id id);
using ImplementationsCache = cldnn::LruCacheThreadSafe<kernel_impl_params, std::shared_ptr<primitive_impl>, kernel_impl_params::Hasher>;
ImplementationsCache& get_implementations_cache() const { return *_impls_cache; }
ICompilationContext& get_compilation_context() const { return *_compilation_context; }
void cancel_compilation_context();

View File

@ -23,8 +23,8 @@ public:
"[GPU] Failed to write " + std::to_string(size) + " bytes to stream! Wrote " + std::to_string(written_size));
}
void setKernlImplParams(void* impl_params) { _impl_params = impl_params; }
void* getKernlImplParams() const { return _impl_params; }
void setKernelImplParams(void* impl_params) { _impl_params = impl_params; }
void* getKernelImplParams() const { return _impl_params; }
private:
std::ostream& stream;
@ -42,8 +42,8 @@ public:
"[GPU] Failed to read " + std::to_string(size) + " bytes from stream! Read " + std::to_string(read_size));
}
void setKernlImplParams(void* impl_params) { _impl_params = impl_params; }
void* getKernlImplParams() const { return _impl_params; }
void setKernelImplParams(void* impl_params) { _impl_params = impl_params; }
void* getKernelImplParams() const { return _impl_params; }
void setNetwork(void* network) { _network = network; }
void* getNetwork() const { return _network; }

View File

@ -29,6 +29,10 @@ struct custom_gpu_primitive : public primitive_base<custom_gpu_primitive> {
struct arg_desc {
arg_type type;
arg_index index;
bool operator==(const arg_desc& rhs) const {
return (type == rhs.type && index == rhs.index);
}
};
/// @brief Constructs custom_gpu_primitive primitive
@ -77,7 +81,14 @@ struct custom_gpu_primitive : public primitive_base<custom_gpu_primitive> {
size_t hash() const override {
size_t seed = primitive::hash();
seed = hash_combine(seed, kernel_entry_point);
seed = hash_combine(seed, kernels_code.size());
for (auto& args : kernel_arguments) {
seed = hash_combine(seed, args.index);
seed = hash_combine(seed, args.type);
}
seed = hash_combine(seed, build_options);
seed = hash_range(seed, kernels_code.begin(), kernels_code.end());
seed = hash_range(seed, gws.begin(), gws.end());
seed = hash_range(seed, lws.begin(), lws.end());
return seed;
}
@ -87,9 +98,25 @@ struct custom_gpu_primitive : public primitive_base<custom_gpu_primitive> {
auto rhs_casted = downcast<const custom_gpu_primitive>(rhs);
return kernel_entry_point == rhs_casted.kernel_entry_point &&
build_options == rhs_casted.build_options &&
kernels_code.size() == rhs_casted.kernels_code.size();
if (kernel_entry_point != rhs_casted.kernel_entry_point)
return false;
if (build_options != rhs_casted.build_options)
return false;
if (kernel_arguments != rhs_casted.kernel_arguments)
return false;
if (kernels_code != rhs_casted.kernels_code)
return false;
if (gws != rhs_casted.gws)
return false;
if (lws != rhs_casted.lws)
return false;
return true;
}
};
} // namespace cldnn

View File

@ -531,7 +531,8 @@ struct layout {
auto pshape = get_partial_shape();
for (size_t idx = 0; idx < pshape.size(); idx++) {
seed = hash_combine(seed, pshape[idx].get_length());
auto v = pshape[idx].is_dynamic() ? -1 : pshape[idx].get_length();
seed = hash_combine(seed, v);
}
return seed;
}

View File

@ -16,11 +16,18 @@ void build_implementations::run(program& p) {
}
auto& cache = p.get_kernels_cache();
for (auto& n : p.get_processing_order()) {
if (auto impl = n->get_selected_impl()) {
auto params = n->get_kernel_impl_params();
cache.add_kernels_source(*params, impl->get_kernels_source());
}
}
cache.build_all();
for (auto& n : p.get_processing_order()) {
if (n->get_selected_impl()) {
n->get_selected_impl()->init_kernels(cache);
n->get_selected_impl()->reset_kernels_source();
if (auto impl = n->get_selected_impl()) {
auto params = n->get_kernel_impl_params();
impl->init_kernels(cache, *params);
impl->reset_kernels_source();
}
}
cache.reset();

View File

@ -65,13 +65,9 @@ void compile_graph::run(program& p) {
can_select_impl = false;
if (can_select_impl) {
tasks.push_back([node, &p, &exception] {
tasks.push_back([node, &exception] {
try {
node->selected_impl = node->type()->choose_impl(*node);
if (node->selected_impl) {
auto kernel_ids = p.get_kernels_cache().add_kernels_source(node->selected_impl->get_kernels_source());
node->selected_impl->set_kernel_ids(kernel_ids);
}
} catch(...) {
exception = std::current_exception();
}

View File

@ -70,8 +70,8 @@ void post_input_reorder::run(program& p) {
node->set_output_layout(previous_layout, false);
reorder.set_selected_impl(reorder.type()->choose_impl(reorder));
if (auto impl = reorder.get_selected_impl()) {
auto kernel_ids = p.get_kernels_cache().add_kernels_source(impl->get_kernels_source());
impl->set_kernel_ids(kernel_ids);
auto params = reorder.get_kernel_impl_params();
p.get_kernels_cache().add_kernels_source(*params, impl->get_kernels_source());
}
}
}

View File

@ -57,8 +57,8 @@ void post_optimize_weights::optimize_weights(T& node, program& p) {
if ((!g_node.is_constant()) && (!reorder.second)) {
g_node.set_selected_impl(g_node.type()->choose_impl(g_node));
if (auto impl = g_node.get_selected_impl()) {
auto kernel_ids = p.get_kernels_cache().add_kernels_source(impl->get_kernels_source());
impl->set_kernel_ids(kernel_ids);
auto params = g_node.get_kernel_impl_params();
p.get_kernels_cache().add_kernels_source(*params, impl->get_kernels_source());
}
}
}

View File

@ -36,8 +36,8 @@ void remove_redundant_reorders::run(program& p) {
node.set_unique_id();
node.set_selected_impl(node.type()->choose_impl(node));
if (auto impl = node.get_selected_impl()) {
auto kernel_ids = p.get_kernels_cache().add_kernels_source(impl->get_kernels_source());
impl->set_kernel_ids(kernel_ids);
auto params = node.get_kernel_impl_params();
p.get_kernels_cache().add_kernels_source(*params, impl->get_kernels_source());
}
};

View File

@ -54,7 +54,7 @@ struct condition_impl : typed_primitive_impl<condition> {
return make_unique<condition_impl>(arg);
}
void init_kernels(const kernels_cache&) override {}
void init_kernels(const kernels_cache& , const kernel_impl_params&) override {}
private:
primitive_id _node_id;

View File

@ -23,7 +23,7 @@ struct loop_impl : typed_primitive_impl<loop> {
return make_unique<loop_impl>(*this);
}
void init_kernels(const kernels_cache&) override {}
void init_kernels(const kernels_cache& , const kernel_impl_params&) override {}
loop_impl() : parent() {}

View File

@ -29,7 +29,7 @@ public:
return make_unique<wait_for_events_impl>(*this);
}
void init_kernels(const kernels_cache&) override {}
void init_kernels(const kernels_cache&, const kernel_impl_params&) override {}
void set_arguments(primitive_inst& /*instance*/) override {}
kernel_arguments_data get_arguments(const primitive_inst& /*instance*/) const override {
kernel_arguments_data args;

View File

@ -61,7 +61,7 @@ struct assign_impl : public typed_primitive_impl<assign> {
return ev_set_memory;
}
void init_kernels(const kernels_cache&) override {}
void init_kernels(const kernels_cache& , const kernel_impl_params&) override {}
public:
static std::unique_ptr<primitive_impl> create(const assign_node& arg, const kernel_impl_params& impl_param) {

View File

@ -845,7 +845,7 @@ public:
return ev;
}
void init_kernels(const kernels_cache&) override {}
void init_kernels(const kernels_cache& , const kernel_impl_params&) override {}
static std::unique_ptr<primitive_impl> create(const detection_output_node& arg, const kernel_impl_params&) {
return make_unique<detection_output_impl>(arg);

View File

@ -419,7 +419,7 @@ struct non_max_suppression_impl : typed_primitive_impl<non_max_suppression> {
static std::unique_ptr<primitive_impl> create(const non_max_suppression_node&, const kernel_impl_params&) {
return make_unique<non_max_suppression_impl>();
}
void init_kernels(const kernels_cache&) override {}
void init_kernels(const kernels_cache&, const kernel_impl_params&) override {}
};
namespace detail {

View File

@ -426,7 +426,7 @@ struct proposal_impl : typed_primitive_impl<proposal> {
return ev;
}
void init_kernels(const kernels_cache&) override {}
void init_kernels(const kernels_cache&, const kernel_impl_params&) override {}
static std::unique_ptr<primitive_impl> create(const proposal_node& arg, const kernel_impl_params& impl_param) {
const layout& l = impl_param.input_layouts[2];

View File

@ -62,7 +62,7 @@ struct read_value_impl : public typed_primitive_impl<read_value> {
return instance.get_network().get_stream().create_user_event(true);
}
void init_kernels(const kernels_cache&) override {}
void init_kernels(const kernels_cache& , const kernel_impl_params&) override {}
public:
static std::unique_ptr<primitive_impl> create(const read_value_node& arg, const kernel_impl_params& impl_param) {

View File

@ -28,7 +28,7 @@ struct custom_gpu_primitive_impl : typed_primitive_impl<custom_gpu_primitive> {
std::shared_ptr<kernel_selector::cl_kernel_data> cl_kernel;
std::vector<kernel::ptr> _kernels;
kernel_id _kernel_id;
std::string _cached_kernel_id;
std::unique_ptr<primitive_impl> clone() const override {
return make_unique<custom_gpu_primitive_impl>(*this);
@ -40,7 +40,7 @@ struct custom_gpu_primitive_impl : typed_primitive_impl<custom_gpu_primitive> {
custom_gpu_primitive_impl(const custom_gpu_primitive_impl& other)
: cl_kernel(other.cl_kernel)
, _kernels({})
, _kernel_id(other._kernel_id) {
, _cached_kernel_id(other._cached_kernel_id) {
for (const auto& kernel : other._kernels) {
_kernels.emplace_back(kernel->clone());
}
@ -49,12 +49,27 @@ struct custom_gpu_primitive_impl : typed_primitive_impl<custom_gpu_primitive> {
custom_gpu_primitive_impl(const custom_gpu_primitive_node& arg,
std::shared_ptr<kernel_selector::cl_kernel_data>& cl_kernel)
: cl_kernel(cl_kernel)
, _kernels() {
_kernel_id = arg.get_program().add_kernel(cl_kernel->code.kernelString);
, _kernels()
, _cached_kernel_id() { }
std::vector<std::shared_ptr<cldnn::kernel_string>> get_kernels_source() override {
std::vector<std::shared_ptr<cldnn::kernel_string>> kernel_strings;
kernel_strings.push_back(cl_kernel->code.kernelString);
return kernel_strings;
}
void init_kernels(const kernels_cache& kernels_cache) override {
_kernels.emplace_back(kernels_cache.get_kernel(_kernel_id));
void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) override {
_kernels.clear();
auto compiled_kernels = kernels_cache.get_kernels(params);
_kernels.insert(_kernels.begin(), compiled_kernels.begin(), compiled_kernels.end());
}
void init_by_cached_kernels(const kernels_cache& kernels_cache) override {
_kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(_cached_kernel_id));
}
void set_cached_kernel_ids(const kernels_cache& kernels_cache) override {
_cached_kernel_id = kernels_cache.get_cached_kernel_id(_kernels[0]);
}
void set_arguments_impl(custom_gpu_primitive_inst& instance) override {
@ -78,23 +93,19 @@ struct custom_gpu_primitive_impl : typed_primitive_impl<custom_gpu_primitive> {
return stream.enqueue_kernel(*_kernels.front(), cl_kernel.get()->params, args, events, instance.is_output());
}
std::vector<std::string> get_kernel_ids() const override {
return {_kernel_id};
}
std::vector<kernel::ptr> get_kernels() const override {
return _kernels;
}
void save(BinaryOutputBuffer& ob) const override {
ob << *cl_kernel;
ob << _kernel_id;
ob << _cached_kernel_id;
}
void load(BinaryInputBuffer& ib) override {
cl_kernel = std::make_shared<kernel_selector::cl_kernel_data>();
ib >> *cl_kernel;
ib >> _kernel_id;
ib >> _cached_kernel_id;
}
};

View File

@ -15,7 +15,7 @@ struct generic_layer_impl : typed_primitive_impl<generic_layer> {
kernel_selector::cl_kernel_data _cl_kernel_data;
std::vector<kernel::ptr> _kernels;
kernel_id _kernel_id;
std::string _cached_kernel_id;
DECLARE_OBJECT_TYPE_SERIALIZATION
@ -28,7 +28,7 @@ struct generic_layer_impl : typed_primitive_impl<generic_layer> {
generic_layer_impl(const generic_layer_impl& other)
: _cl_kernel_data(other._cl_kernel_data)
, _kernels({})
, _kernel_id(other._kernel_id) {
, _cached_kernel_id(other._cached_kernel_id) {
if (other._kernels.empty()) {
throw std::runtime_error("Can't copy generic_layer_impl node: kernels vector is empty");
}
@ -37,22 +37,41 @@ struct generic_layer_impl : typed_primitive_impl<generic_layer> {
generic_layer_impl(const generic_layer_node& arg)
: _cl_kernel_data(*arg.get_primitive()->generic_params.clKernel.get())
, _kernels() {
_kernel_id = arg.get_program().add_kernel(arg.get_primitive()->generic_params.clKernel->code.kernelString);
, _kernels()
, _cached_kernel_id() { }
std::vector<std::shared_ptr<cldnn::kernel_string>> get_kernels_source() override {
std::vector<std::shared_ptr<cldnn::kernel_string>> kernel_strings;
kernel_strings.push_back(_cl_kernel_data.code.kernelString);
return kernel_strings;
}
std::vector<kernel::ptr> get_kernels() const override {
return _kernels;
}
void save(BinaryOutputBuffer& ob) const override {
ob <<_cl_kernel_data;
ob << _kernel_id;
ob << _cached_kernel_id;
}
void load(BinaryInputBuffer& ib) override {
ib >> _cl_kernel_data;
ib >> _kernel_id;
ib >> _cached_kernel_id;
}
void init_kernels(const kernels_cache& kernels_cache) override {
_kernels.push_back(kernels_cache.get_kernel(_kernel_id));
void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) override {
_kernels.clear();
auto compiled_kernels = kernels_cache.get_kernels(params);
_kernels.insert(_kernels.begin(), compiled_kernels.begin(), compiled_kernels.end());
}
void init_by_cached_kernels(const kernels_cache& kernels_cache) override {
_kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(_cached_kernel_id));
}
void set_cached_kernel_ids(const kernels_cache& kernels_cache) override {
_cached_kernel_id = kernels_cache.get_cached_kernel_id(_kernels[0]);
}
void set_arguments_impl(generic_layer_inst& instance) override {
@ -114,7 +133,7 @@ struct generic_layer_cpu : typed_primitive_impl<generic_layer> {
return ev;
}
void init_kernels(const kernels_cache&) override {}
void init_kernels(const kernels_cache&, const kernel_impl_params&) override {}
};
static std::unique_ptr<primitive_impl> create(const generic_layer_node& arg, const kernel_impl_params&) {

View File

@ -101,12 +101,13 @@ bool query_local_block_io_supported(engine& e, const ExecutionConfig& config) {
kernel_string->batch_compilation = true;
try {
kernel_impl_params dummy_params;
auto _kernels_cache_device_query = std::unique_ptr<kernels_cache>(new kernels_cache(e, config, 0));
auto id = _kernels_cache_device_query->set_kernel_source(kernel_string, false);
_kernels_cache_device_query->add_kernels_source(dummy_params, {kernel_string}, false);
_kernels_cache_device_query->build_all();
auto kernel = _kernels_cache_device_query->get_kernel(id);
cache[device] = _kernels_cache_device_query->validate_simple_kernel_execution(kernel);
auto _kernels = _kernels_cache_device_query->get_kernels(dummy_params);
cache[device] = _kernels_cache_device_query->validate_simple_kernel_execution(_kernels[0]);
} catch (std::exception& /*ex*/) {
cache[device] = false;
}
@ -1202,7 +1203,7 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p
const auto& config = program->get_config();
const auto& device_info = engine.get_device_info();
params.uniqueID = std::to_string(param_info.unique_id);
params.uniqueID = std::to_string(param_info.hash());
params.engineInfo.supports_fp16 = device_info.supports_fp16;
params.engineInfo.supports_fp64 = device_info.supports_fp64;
params.engineInfo.supports_fp16_denorms = device_info.supports_fp16_denorms;

View File

@ -32,10 +32,10 @@ For example, all gpu convolution implementations should derive from typed_primit
template <class PType>
struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
kernel_selector::kernel_data _kernel_data;
std::vector<kernel_id> _kernel_ids;
std::vector<std::string> _cached_kernel_ids;
std::vector<kernel::ptr> _kernels;
typed_primitive_impl_ocl() : _kernel_data({}), _kernel_ids({}), _kernels({}) {
typed_primitive_impl_ocl() : _kernel_data({}), _cached_kernel_ids({}), _kernels({}) {
_kernel_data.weightsReorderParams.engine = kernel_selector::generic_kernel_params::Engine::NONE;
_kernel_data.weightsReorderParams.cpuKernel = nullptr;
_kernel_data.weightsReorderParams.clKernel = nullptr;
@ -44,7 +44,7 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
typed_primitive_impl_ocl(const typed_primitive_impl_ocl<PType>& other)
: typed_primitive_impl<PType>(other._weights_reorder_params, other._kernel_name, other._is_dynamic)
, _kernel_data(other._kernel_data)
, _kernel_ids(other._kernel_ids)
, _cached_kernel_ids(other._cached_kernel_ids)
, _kernels({}) {
_kernels.reserve(other._kernels.size());
for (size_t k = 0; k < other._kernels.size(); ++k) {
@ -68,20 +68,19 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
// Cache blob format:
// [ kernel_selector::kernel_data ]
// [ kernel_id ]
// [ kernel_arguments ]
// [ kernel_ids ]
void save(BinaryOutputBuffer& ob) const override {
ob << make_data(&_kernel_data.internalBufferDataType, sizeof(kernel_selector::Datatype));
ob << _kernel_data.internalBufferSizes;
ob << _kernel_data.kernels;
ob << _kernel_ids;
ob << _cached_kernel_ids;
}
void load(BinaryInputBuffer& ib) override {
ib >> make_data(&_kernel_data.internalBufferDataType, sizeof(kernel_selector::Datatype));
ib >> _kernel_data.internalBufferSizes;
ib >> _kernel_data.kernels;
ib >> _kernel_ids;
ib >> _cached_kernel_ids;
}
template<typename ImplType>
@ -134,20 +133,32 @@ protected:
return stream.enqueue_marker(events, is_output);
}
void init_kernels(const kernels_cache& kernels_cache) override {
void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) override {
if (is_cpu()) {
return;
}
_kernels.clear();
if (!_kernel_data.kernels.empty()) {
auto compiled_kernels = kernels_cache.get_kernels(params);
_kernels.insert(_kernels.begin(), compiled_kernels.begin(), compiled_kernels.end());
}
}
void init_by_cached_kernels(const kernels_cache& kernels_cache) override {
if (is_cpu()) {
return;
}
_kernels.clear();
_kernels.reserve(_kernel_ids.size());
for (size_t k = 0; k < _kernel_ids.size(); ++k) {
_kernels.emplace_back(kernels_cache.get_kernel(_kernel_ids[k]));
_kernels.reserve(_cached_kernel_ids.size());
for (size_t k = 0; k < _cached_kernel_ids.size(); ++k) {
_kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(_cached_kernel_ids[k]));
}
}
std::vector<std::string> get_kernel_ids() const override {
return _kernel_ids;
void set_cached_kernel_ids(const kernels_cache& kernels_cache) override {
_cached_kernel_ids = kernels_cache.get_cached_kernel_ids(_kernels);
}
std::vector<kernel::ptr> get_kernels() const override {
@ -258,10 +269,6 @@ protected:
return aggregate_events(all_events, stream, group_events);
}
void set_kernel_ids(std::vector<kernel_id> kernel_ids) override {
_kernel_ids = kernel_ids;
}
std::vector<std::shared_ptr<cldnn::kernel_string>> get_kernels_source() override {
std::vector<std::shared_ptr<cldnn::kernel_string>> kernel_strings;
for (size_t i = 0; i < _kernel_data.kernels.size(); ++i) {
@ -283,18 +290,26 @@ protected:
}
}
void set_kernels(std::map<const std::string, kernel::ptr>& kernels) override {
void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) override {
if (is_cpu())
return;
_kernel_ids.clear();
size_t total_kernels_num = std::accumulate(kernels.begin(), kernels.end(), 0,
[](size_t val, cldnn::kernels_cache::compiled_kernels::value_type& p) {
return (val + p.second.size());
});
_kernels.clear();
_kernels.reserve(kernels.size());
_kernels.reserve(total_kernels_num);
for (auto& k : kernels) {
_kernel_ids.push_back(k.first);
_kernels.emplace_back(std::move(k.second));
_kernels.insert(_kernels.end(), k.second.begin(), k.second.end());
}
}
std::vector<kernel::ptr> get_kernels() override {
return _kernels;
}
};
} // namespace ocl

View File

@ -78,7 +78,7 @@ public:
parent::save(ob);
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ob.getKernlImplParams());
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ob.getKernelImplParams());
auto prim = impl_params->typed_desc<concatenation>();
ob << prim->axis;
@ -101,7 +101,7 @@ public:
int64_t prim_axis;
ib >> prim_axis;
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernlImplParams());
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernelImplParams());
auto prim_desc = get_concatenation_primitive_descriptor(*impl_params, ib.get_engine(), *_attrs, prim_axis);
_pd = *prim_desc;

View File

@ -196,7 +196,7 @@ public:
_attrs->set_zero_points_mask(DNNL_ARG_SRC, _zero_point_mask);
}
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernlImplParams());
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernelImplParams());
auto input_md = onednn::layout_to_memory_desc(impl_params->get_input_layout(0), dnnl::memory::format_tag::undef);
auto weights_md = onednn::layout_to_memory_desc(impl_params->get_input_layout(1), dnnl::memory::format_tag::any);

View File

@ -108,7 +108,7 @@ public:
#ifdef ONEDNN_PRIMITIVE_SERIALIZATION
parent::load(ib);
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernlImplParams());
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernelImplParams());
auto input_md = onednn::layout_to_memory_desc(impl_params->get_input_layout(0), dnnl::memory::format_tag::undef);
auto weights_md = onednn::layout_to_memory_desc(impl_params->get_input_layout(1), dnnl::memory::format_tag::any);

View File

@ -159,7 +159,7 @@ public:
#ifdef ONEDNN_PRIMITIVE_SERIALIZATION
parent::save(ob);
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ob.getKernlImplParams());
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ob.getKernelImplParams());
auto prim = impl_params->typed_desc<fully_connected>();
size_t input_size = prim->input_size;
bool has_bias = !prim->bias.empty();
@ -181,7 +181,7 @@ public:
ib >> input_size;
ib >> has_bias;
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernlImplParams());
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernelImplParams());
auto prim_desc = get_fully_connected_primitive_descriptor(*impl_params, ib.get_engine(), input_size, has_bias, *_attrs);
_pd = *prim_desc;

View File

@ -178,7 +178,7 @@ public:
#ifdef ONEDNN_PRIMITIVE_SERIALIZATION
parent::save(ob);
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ob.getKernlImplParams());
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ob.getKernelImplParams());
auto prim = impl_params->typed_desc<gemm>();
bool gemm_with_bias = prim->dependencies().size() == 3;

View File

@ -108,7 +108,7 @@ public:
#ifdef ONEDNN_PRIMITIVE_SERIALIZATION
parent::load(ib);
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernlImplParams());
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernelImplParams());
dnnl::algorithm alg;
ib >> make_data(&alg, sizeof(dnnl::algorithm));

View File

@ -200,7 +200,7 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
_attrs->set_fpmath_mode(_fmath_mode);
}
{
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernlImplParams());
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernelImplParams());
const std::vector<cldnn::fused_primitive_desc_onednn>& fused_desc = impl_params->fused_desc_onednn;
dnnl::post_ops _post_ops;
int post_ops_len;
@ -451,7 +451,7 @@ protected:
return args;
}
void init_kernels(const kernels_cache&) override { }
void init_kernels(const kernels_cache&, const kernel_impl_params&) override { }
event::ptr aggregate_events(const std::vector<event::ptr>& events, stream& stream, bool group = false, bool is_output = false) const {
if (events.size() == 1 && !is_output)

View File

@ -119,7 +119,7 @@ public:
#ifdef ONEDNN_PRIMITIVE_SERIALIZATION
parent::load(ib);
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernlImplParams());
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernelImplParams());
dnnl::algorithm alg;
ib >> make_data(&alg, sizeof(dnnl::algorithm));

View File

@ -77,7 +77,7 @@ public:
#ifdef ONEDNN_PRIMITIVE_SERIALIZATION
parent::load(ib);
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernlImplParams());
const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernelImplParams());
auto input_md = onednn::layout_to_memory_desc(impl_params->get_input_layout(0));
auto output_md = onednn::layout_to_memory_desc(impl_params->get_output_layout());

View File

@ -38,10 +38,102 @@ struct generic_layer : public primitive_base<generic_layer> {
size_t hash() const override {
size_t seed = primitive::hash();
seed = hash_combine(seed, id);
seed = hash_combine(seed, generic_params.engine);
if (generic_params.cpuKernel != nullptr) {
auto& cpuKernel = generic_params.cpuKernel;
seed = hash_combine(seed, cpuKernel->GetExpectedInputLayout());
seed = hash_combine(seed, cpuKernel->GetExpectedInputType());
}
if (generic_params.clKernel != nullptr) {
auto& clKernel = generic_params.clKernel;
seed = hash_combine(seed, clKernel->skip_execution);
auto& gws = clKernel->params.workGroups.global;
seed = hash_range(seed, gws.begin(), gws.end());
auto& lws = clKernel->params.workGroups.local;
seed = hash_range(seed, lws.begin(), lws.end());
auto& arguments = clKernel->params.arguments;
for (auto& args : arguments) {
seed = hash_combine(seed, args.index);
seed = hash_combine(seed, args.t);
}
auto& scalars = clKernel->params.scalars;
for (auto& s : scalars) {
seed = hash_combine(seed, s.t);
}
seed = hash_combine(seed, clKernel->code.kernelString->get_hash());
}
return seed;
}
bool operator==(const primitive& rhs) const override {
if (!compare_common_params(rhs))
return false;
auto rhs_casted = downcast<const generic_layer>(rhs);
if (generic_params.engine != rhs_casted.generic_params.engine)
return false;
if (generic_params.cpuKernel != nullptr) {
if (generic_params.cpuKernel->GetExpectedInputLayout() != rhs_casted.generic_params.cpuKernel->GetExpectedInputLayout())
return false;
if (generic_params.cpuKernel->GetExpectedInputType() != rhs_casted.generic_params.cpuKernel->GetExpectedInputType())
return false;
}
if (generic_params.clKernel != nullptr) {
auto& clKernel = generic_params.clKernel;
auto& clKernel_rhs = rhs_casted.generic_params.clKernel;
if (clKernel->skip_execution != clKernel_rhs->skip_execution)
return false;
auto& gws = clKernel->params.workGroups.global;
auto& gws_rhs = clKernel_rhs->params.workGroups.global;
if (gws != gws_rhs)
return false;
auto& lws = clKernel->params.workGroups.local;
auto& lws_rhs = clKernel_rhs->params.workGroups.local;
if (lws != lws_rhs)
return false;
auto& arguments = clKernel->params.arguments;
auto& arguments_rhs = clKernel_rhs->params.arguments;
if (arguments.size() != arguments_rhs.size())
return false;
for (size_t idx = 0; idx < arguments.size(); idx++) {
if (arguments[idx].index != arguments_rhs[idx].index)
return false;
if (arguments[idx].t != arguments_rhs[idx].t)
return false;
}
auto& scalars = clKernel->params.scalars;
auto& scalars_rhs = clKernel_rhs->params.scalars;
if (scalars.size() != scalars_rhs.size())
return false;
for (size_t idx = 0; idx < scalars.size(); idx++) {
if (scalars[idx].t != scalars_rhs[idx].t)
return false;
}
if (clKernel->code.kernelString->get_str() != clKernel_rhs->code.kernelString->get_str())
return false;
}
return true;
}
protected:
std::vector<std::reference_wrapper<const primitive_id>> get_dependencies() const override { return {}; }
};

View File

@ -59,15 +59,13 @@ struct primitive_impl {
kernel_selector::weights_reorder_params _weights_reorder_params;
// class typed_primitive_gpu_impl override this with return false;
virtual bool is_cpu() const { return true; }
virtual void init_kernels(const kernels_cache&) = 0;
virtual void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) = 0;
virtual void init_by_cached_kernels(const kernels_cache&) {}
virtual void set_cached_kernel_ids(const kernels_cache&) {}
virtual std::unique_ptr<primitive_impl> clone() const = 0;
virtual std::vector<std::string> get_kernel_ids() const {
return {};
}
virtual std::vector<std::shared_ptr<cldnn::kernel_string>> get_kernels_source() { return {}; }
virtual void reset_kernels_source() {}
virtual std::vector<kernel::ptr> get_kernels() const { return {}; }
virtual void set_kernel_ids(std::vector<kernel_id> kernel_ids) {}
virtual void save(cldnn::BinaryOutputBuffer& ob) const {}
virtual void load(cldnn::BinaryInputBuffer& ib) {}
@ -88,7 +86,8 @@ struct primitive_impl {
return primitive_impl::static_canonicalize_shapes(impl_params);
}
virtual void set_kernels(std::map<const std::string, kernel::ptr>& kernels) {}
virtual void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) {}
virtual std::vector<kernel::ptr> get_kernels() { return {}; }
protected:
std::string _kernel_name;
@ -163,8 +162,13 @@ public:
event::ptr execute(const std::vector<event::ptr>& events);
void init_kernels(const kernels_cache& kernels_cache) {
_impl->init_kernels(kernels_cache);
_impl->init_kernels(kernels_cache, *_impl_params);
}
// Thin wrapper: delegates restoring this instance's kernels from the cache's
// deserialized (cached) kernels store to the selected implementation.
void init_by_cached_kernels(const kernels_cache& kernels_cache) {
_impl->init_by_cached_kernels(kernels_cache);
}
void set_arguments();
void validate() const {

View File

@ -15,7 +15,9 @@
namespace cldnn {
size_t kernel_impl_params::hash() const {
size_t seed = desc->hash();
size_t seed = 0;
if (desc != nullptr)
seed = desc->hash();
const size_t prime_number = 2654435761; // magic number to reduce hash collision rate.
for (auto& in : input_layouts) {
seed = hash_combine(seed, in.hash() * prime_number);
@ -32,7 +34,10 @@ size_t kernel_impl_params::hash() const {
}
bool kernel_impl_params::operator==(const kernel_impl_params& rhs) const {
if (*desc != *rhs.desc)
if ((desc != nullptr && rhs.desc == nullptr) || (desc == nullptr && rhs.desc != nullptr))
return false;
if ((desc != nullptr && rhs.desc != nullptr) && *desc != *rhs.desc)
return false;
if (rhs.input_layouts.size() != input_layouts.size())

View File

@ -405,7 +405,7 @@ network::network(cldnn::BinaryInputBuffer& ib, const ExecutionConfig& config, st
ib >> *p_inst;
_primitives[p_inst->id()] = p_inst;
if (p_inst->get_impl() != nullptr)
p_inst->init_kernels(kernels_cache);
p_inst->init_by_cached_kernels(kernels_cache);
}
for (auto& item : _primitives) {
@ -515,10 +515,12 @@ network::~network() {
// [ executable primitive_inst ]
// [ memory reuse information ]
void network::save(cldnn::BinaryOutputBuffer& ob) {
kernels_cache kernels_cache(get_engine(), _config, 0, nullptr, {""});
auto& kernels_cache = _program->get_kernels_cache();
kernels_cache.reset();
for (const auto& p_inst : _exec_order) {
if (p_inst->get_impl() != nullptr)
kernels_cache.add_kernels(p_inst->get_impl()->get_kernel_ids(), p_inst->get_impl()->get_kernels());
if (p_inst->get_impl() != nullptr) {
kernels_cache.add_to_cached_kernels(p_inst->get_impl()->get_kernels());
}
}
ob << kernels_cache;
@ -597,6 +599,7 @@ void network::save(cldnn::BinaryOutputBuffer& ob) {
}
ob << get_ext_id_mapping();
kernels_cache.reset();
}
network::ptr network::allocate_network(stream::ptr stream, program::ptr program, bool is_internal, bool is_primary_stream) {

View File

@ -383,7 +383,7 @@ bool primitive_inst::update_impl() {
}
auto impl = _node->type()->choose_impl(*_node, updated_params);
auto kernels = _program->get_kernels_cache().compile(impl->get_kernels_source());
auto kernels = _program->get_kernels_cache().compile(updated_params, impl->get_kernels_source());
impl->set_kernels(kernels);
cache.add(updated_params, impl->clone());
});
@ -395,7 +395,7 @@ bool primitive_inst::update_impl() {
} else {
_impl = _node->type()->choose_impl(*_node, updated_params);
auto& kernels_cache = get_network().get_program()->get_kernels_cache();
auto kernels = kernels_cache.compile(_impl->get_kernels_source());
auto kernels = kernels_cache.compile(updated_params, _impl->get_kernels_source());
_impl->set_kernels(kernels);
cache.add(updated_params, _impl->clone());
@ -736,9 +736,9 @@ event::ptr primitive_inst::update_weights() {
GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights from " << original_layout.to_short_string()
<< " to " << expected_layout.to_short_string() << std::endl;
auto& kernels_cache = get_network().get_program()->get_kernels_cache();
auto kernels = kernels_cache.compile({weights_params.clKernel->code.kernelString});
auto kernels = kernels_cache.compile(*_impl_params, {weights_params.clKernel->code.kernelString});
OPENVINO_ASSERT(kernels.size() == 1, "The output of kernel compile has issue");
kernel = kernels.begin()->second;
kernel = (kernels.begin()->second)[0];
cache.add(kernel_key, kernel);
}
@ -1096,7 +1096,7 @@ static primitive_id find_dep_by_mem(const cldnn::primitive_inst* p_inst, memory&
// [ intermediate memory information ]
void primitive_inst::save(cldnn::BinaryOutputBuffer& ob) const {
_impl_params->save(ob);
ob.setKernlImplParams(_impl_params.get());
ob.setKernelImplParams(_impl_params.get());
ob << _node_output_layout;
ob << has_mutable_input();
@ -1169,6 +1169,7 @@ void primitive_inst::save(cldnn::BinaryOutputBuffer& ob) const {
if (_impl != nullptr) {
ob << true;
_impl->set_cached_kernel_ids(_network.get_program()->get_kernels_cache());
ob << _impl;
} else {
ob << false;
@ -1186,7 +1187,7 @@ int32_t primitive_inst::get_index_in_deps(memory::cptr arg) const {
void primitive_inst::load(cldnn::BinaryInputBuffer& ib) {
_impl_params->load(ib);
ib.setKernlImplParams(_impl_params.get());
ib.setKernelImplParams(_impl_params.get());
ib >> _node_output_layout;
ib >> _has_mutable_input;

View File

@ -229,14 +229,6 @@ std::shared_ptr<InferenceEngine::CPUStreamsExecutor> program::make_task_executor
return std::make_shared<InferenceEngine::CPUStreamsExecutor>(task_executor_config);
}
kernel_id program::add_kernel(const std::shared_ptr<kernel_string>& kernelSring) {
return _kernels_cache->set_kernel_source(kernelSring, false);
}
kernel::ptr program::get_kernel(kernel_id id) {
return _kernels_cache->get_kernel(id);
}
kernels_cache& program::get_kernels_cache() const {
return *_kernels_cache;
}
@ -1640,10 +1632,6 @@ std::pair<int64_t, int64_t> program::get_estimated_device_mem_usage() {
return std::make_pair(const_sum, get_engine().get_used_device_memory(allocation_type::usm_device));
}
void program::remove_kernel(kernel_id id) {
_kernels_cache->remove_kernel(id);
}
void program::cancel_compilation_context() {
if (_compilation_context != nullptr)
_compilation_context->cancel();

View File

@ -54,7 +54,6 @@ std::string reorder_options(const std::string& org_options) {
} // namespace
namespace cldnn {
std::atomic<size_t> kernels_cache::_kernel_idx{0};
std::mutex kernels_cache::_mutex;
std::string kernels_cache::get_cache_path() const {
@ -90,48 +89,54 @@ void kernels_cache::get_program_source(const kernels_code& kernels_source_code,
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "KernelsCache::BuildAll::GetProgramSource");
std::map<std::string, std::tuple<int32_t, std::vector<batch_program>>> program_buckets;
for (const auto& code : kernels_source_code) {
std::string full_code = code.kernel_strings->jit + code.kernel_strings->str + code.kernel_strings->undefs;
std::string entry_point = code.kernel_strings->entry_point;
std::string options = code.kernel_strings->options;
bool batch_compilation = code.kernel_strings->batch_compilation;
for (const auto& k : kernels_source_code) {
auto& code = k.second;
bool dump_custom_program = code.dump_custom_program;
if (batch_compilation) {
options = reorder_options(options);
for (auto kernel_string : code.kernel_strings) {
std::string full_code = kernel_string->jit + kernel_string->str + kernel_string->undefs;
std::string entry_point = kernel_string->entry_point;
std::string options = kernel_string->options;
bool batch_compilation = kernel_string->batch_compilation;
if (batch_compilation) {
options = reorder_options(options);
}
std::string key = options;
if (batch_compilation == false) {
key += " __PROGRAM__" + std::to_string(program_buckets.size());
}
if (dump_custom_program) {
key += " __DUMP_CUSTOM_PROGRAM__"; // Adding label to key so it would be separated from other programs
}
auto& bucket_id = std::get<0>(program_buckets[key]);
auto& current_bucket = std::get<1>(program_buckets[key]);
if (current_bucket.empty()) { // new bucket
const auto& batch_id = 0;
// increase bucket id if and only if new bucket comes
bucket_id = static_cast<int32_t>(program_buckets.size() - 1);
current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_header_str));
}
// Create new kernels batch when the limit is reached
// and current kernel's entry_point is duplicated in this kernels batch
if (current_bucket.back().kernels_counter >= get_max_kernels_per_batch()
|| current_bucket.back().entry_point_to_id.find(entry_point) != current_bucket.back().entry_point_to_id.end()) {
const auto& batch_id = static_cast<int32_t>(current_bucket.size());
current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_header_str));
}
auto& current_batch = current_bucket.back();
current_batch.dump_custom_program = dump_custom_program;
current_batch.entry_point_to_id.emplace(entry_point, code.params);
current_batch.source.push_back(std::move(full_code));
current_batch.kernels_counter++;
}
std::string key = options;
if (batch_compilation == false) {
key += " __PROGRAM__" + std::to_string(program_buckets.size());
}
if (dump_custom_program) {
key += " __DUMP_CUSTOM_PROGRAM__"; // Adding label to key so it would be separated from other programs
}
auto& bucket_id = std::get<0>(program_buckets[key]);
auto& current_bucket = std::get<1>(program_buckets[key]);
if (current_bucket.empty()) { // new bucket
const auto& batch_id = 0;
// increase bucket id if and only if new bucket comes
bucket_id = static_cast<int32_t>(program_buckets.size() - 1);
current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_header_str));
}
// Create new kernels batch when the limit is reached
if (current_bucket.back().kernels_counter >= get_max_kernels_per_batch()) {
const auto& batch_id = static_cast<int32_t>(current_bucket.size());
current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_header_str));
}
auto& current_batch = current_bucket.back();
current_batch.dump_custom_program = dump_custom_program;
current_batch.entry_point_to_id[entry_point] = code.id;
current_batch.source.push_back(std::move(full_code));
current_batch.kernels_counter++;
}
// Compute hash value for each batch
@ -165,13 +170,6 @@ kernels_cache::kernels_cache(engine& engine,
, _prog_id(prog_id)
, batch_header_str(std::move(batch_header_str)) { }
kernel_id kernels_cache::set_kernel_source(
const std::shared_ptr<kernel_string>& kernel_string,
bool dump_custom_program) {
auto kernel_ids = add_kernels_source({kernel_string}, dump_custom_program);
return kernel_ids[0];
}
static std::vector<unsigned char> getProgramBinaries(cl::Program program) {
// Get the size of the program binary in bytes.
std::vector<size_t> binary_sizes = program.getInfo<CL_PROGRAM_BINARY_SIZES>();
@ -189,7 +187,7 @@ static std::vector<unsigned char> getProgramBinaries(cl::Program program) {
}
// TODO: This build_batch method should be backend specific
void kernels_cache::build_batch(const engine& build_engine, const batch_program& batch, std::map<const std::string, kernel::ptr>& compiled_kernels) {
void kernels_cache::build_batch(const engine& build_engine, const batch_program& batch, compiled_kernels& compiled_kernels) {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "KernelsCache::build_batch");
auto& cl_build_engine = dynamic_cast<const ocl::ocl_engine&>(build_engine);
@ -280,13 +278,17 @@ void kernels_cache::build_batch(const engine& build_engine, const batch_program&
std::lock_guard<std::mutex> lock(_mutex);
for (auto& k : kernels) {
const auto& entry_point = k.getInfo<CL_KERNEL_FUNCTION_NAME>();
const auto& k_id = batch.entry_point_to_id.find(entry_point);
if (k_id != batch.entry_point_to_id.end()) {
const auto& iter = batch.entry_point_to_id.find(entry_point);
if (iter != batch.entry_point_to_id.end()) {
cl_kernel kern = k.get();
cl_context context = cl_build_engine.get_cl_context().get();
kernel::ptr kernel = kernels_factory::create(_engine, context, kern, entry_point);
const auto& kmap = std::make_pair(k_id->second, kernel);
compiled_kernels.insert(kmap);
auto& params = iter->second;
if (compiled_kernels.find(params) != compiled_kernels.end()) {
compiled_kernels[params].push_back(kernel);
} else {
compiled_kernels[params] = { kernel };
}
} else {
throw std::runtime_error("Could not find entry point");
}
@ -328,14 +330,28 @@ void kernels_cache::build_batch(const engine& build_engine, const batch_program&
}
}
kernel::ptr kernels_cache::get_kernel(kernel_id id) const {
if (_pending_compilation)
throw std::runtime_error("Kernel cache is not compiled, call build_all() first!");
// Looks up a deserialized kernel by its cached-kernel id and hands back an
// independent clone, so the caller never aliases the cache's own instance.
kernel::ptr kernels_cache::get_kernel_from_cached_kernels(std::string id) const {
    const auto found = _cached_kernels.find(id);
    OPENVINO_ASSERT(_cached_kernels.end() != found, "[GPU] Kernel " + id + " not found in the cached kernel cache!");
    return found->second->clone();
}
auto res = _kernels.find(id);
if (_kernels.end() == res)
throw std::runtime_error("Kernel " + id + " not found in the kernel cache!");
return res->second;
// Returns clones of all compiled kernels registered under the given impl params.
// Asserts if compilation is still pending (build_all() not called) or if no
// entry exists for these params; the node id (when a descriptor is present)
// is included in the failure message to aid debugging.
std::vector<kernel::ptr> kernels_cache::get_kernels(kernel_impl_params params) const {
    OPENVINO_ASSERT((_pending_compilation == false), "Kernel cache is not compiled, call build_all() first!");

    std::string current_node_id;
    if (params.desc) {
        current_node_id = params.desc->id;
    }

    const auto found = _kernels.find(params);
    OPENVINO_ASSERT(_kernels.end() != found, "Kernel for {" + current_node_id + "} is not found in the kernel cache!");

    // Clone every kernel so each caller owns an independent copy.
    std::vector<kernel::ptr> result;
    result.reserve(found->second.size());
    for (const auto& compiled : found->second) {
        result.push_back(compiled->clone());
    }
    return result;
}
bool kernels_cache::validate_simple_kernel_execution(kernel::ptr krl) {
@ -430,117 +446,110 @@ void kernels_cache::reset() {
_pending_compilation = false;
}
std::vector<kernel_id> kernels_cache::add_kernels_source(std::vector<std::shared_ptr<kernel_string>> kernel_sources, bool dump_custom_program) {
std::vector<kernel_id> kernel_ids;
kernel_ids.reserve(kernel_sources.size());
for (size_t i = 0; i < kernel_sources.size(); ++i) {
std::lock_guard<std::mutex> lock(_mutex);
auto kernel_string = kernel_sources[i];
kernel_id id = gen_kernel_id(kernel_string->entry_point);
auto res = _kernels_code.emplace(kernel_string, id, dump_custom_program);
void kernels_cache::add_kernels_source(const kernel_impl_params& params,
const std::vector<std::shared_ptr<kernel_string>>& kernel_sources,
bool dump_custom_program) {
std::lock_guard<std::mutex> lock(_mutex);
assert(_kernels.find(id) == _kernels.end());
if (!kernel_sources.empty() && (_kernels_code.find(params) == _kernels_code.end())) {
auto res = _kernels_code.insert({params, {kernel_sources, params, dump_custom_program}});
assert(_kernels.find(params) == _kernels.end());
if (res.second) {
_pending_compilation = true;
}
kernel_ids.emplace_back(id);
}
}
// Builds the serialization id of a compiled kernel as "<entry_point>@<binary_id>",
// where binary_id was assigned when the kernel's program binaries were registered
// in _cached_binaries (see add_to_cached_kernels). Asserts if the binaries were
// never registered.
std::string kernels_cache::get_cached_kernel_id(kernel::ptr kernel) const {
    const auto cl_kernel = std::static_pointer_cast<cldnn::ocl::ocl_kernel>(kernel);
    const auto& entry_point = cl_kernel->get_handle().getInfo<CL_KERNEL_FUNCTION_NAME>();
    cl::vector<unsigned char> binaries = getProgramBinaries(cl_kernel->get_handle().getInfo<CL_KERNEL_PROGRAM>());
    const auto match = _cached_binaries.find(binaries);
    OPENVINO_ASSERT(match != _cached_binaries.end(), "[GPU] Not found cached kernel binaries");
    return entry_point + "@" + std::to_string(match->second);
}
// Maps each kernel to its cached-kernel id (see get_cached_kernel_id),
// preserving the input order.
std::vector<std::string> kernels_cache::get_cached_kernel_ids(const std::vector<kernel::ptr>& kernels) const {
    std::vector<std::string> ids;
    ids.reserve(kernels.size());
    for (const auto& krnl : kernels) {
        ids.push_back(get_cached_kernel_id(krnl));
    }
    return ids;
}
void kernels_cache::add_kernels(const std::vector<std::string>& kernel_ids, const std::vector<kernel::ptr>& kernels) {
OPENVINO_ASSERT(kernel_ids.size() == kernels.size(), "[GPU] The sizes of kernel_ids and kernels are different.");
void kernels_cache::add_to_cached_kernels(const std::vector<kernel::ptr>& kernels) {
static std::atomic<uint32_t> id_gen{0};
for (size_t i = 0; i < kernel_ids.size(); i++) {
const auto& kmap = std::make_pair(kernel_ids[i], kernels[i]);
_kernels.insert(kmap);
_kernel_idx++;
for (auto& kernel : kernels) {
auto ocl_kernel = std::static_pointer_cast<cldnn::ocl::ocl_kernel>(kernel);
auto program = ocl_kernel->get_handle().getInfo<CL_KERNEL_PROGRAM>();
cl::vector<unsigned char> program_binaries = getProgramBinaries(program);
std::lock_guard<std::mutex> lock(_mutex);
auto iter = _cached_binaries.find(program_binaries);
if (iter == _cached_binaries.end()) {
_cached_binaries[program_binaries] = id_gen++;
}
auto key = get_cached_kernel_id(kernel);
if (_cached_kernels.find(key) == _cached_kernels.end()) {
_cached_kernels[key] = kernel;
}
}
}
void kernels_cache::save(BinaryOutputBuffer& ob) const {
OPENVINO_ASSERT(_engine.type() == engine_types::ocl, "[GPU] Not supported engine type");
std::map<std::string, std::string> entry_point_to_id;
for (auto iter = _kernels.begin(); iter != _kernels.end(); iter++) {
std::string k_id = iter->first;
kernel::ptr kernel = iter->second;
auto ocl_kernel = std::static_pointer_cast<cldnn::ocl::ocl_kernel>(kernel);
const auto& entry_point = ocl_kernel->get_handle().getInfo<CL_KERNEL_FUNCTION_NAME>();
entry_point_to_id[entry_point] = k_id;
ob << _cached_binaries.size();
for (auto& cached_binary : _cached_binaries) {
ob << cached_binary.second;
ob << cached_binary.first;
}
ob << entry_point_to_id;
std::unique_ptr<ocl::ocl_engine> build_engine = cldnn::make_unique<ocl::ocl_engine>(_engine.get_device(), runtime_types::ocl);
std::vector<std::vector<unsigned char>> precompiled_kernels;
for (auto iter = _kernels.begin(); iter != _kernels.end(); iter++) {
kernel::ptr kernel = iter->second;
auto ocl_kernel = std::static_pointer_cast<cldnn::ocl::ocl_kernel>(kernel);
auto program = ocl_kernel->get_handle().getInfo<CL_KERNEL_PROGRAM>();
const auto& entry_point = ocl_kernel->get_handle().getInfo<CL_KERNEL_FUNCTION_NAME>();
const auto& k_id = entry_point_to_id.find(entry_point);
if (k_id != entry_point_to_id.end()) {
cl::Program::Binaries binary_kernels = {getProgramBinaries(program)};
try {
cl::vector<cl::Kernel> kernels;
cl::Program programs(build_engine->get_cl_context(), {build_engine->get_cl_device()}, binary_kernels);
programs.build({build_engine->get_cl_device()});
programs.createKernels(&kernels);
for (auto& k : kernels) {
const auto& entry_point = k.getInfo<CL_KERNEL_FUNCTION_NAME>();
entry_point_to_id.erase(entry_point);
}
precompiled_kernels.push_back(std::move(binary_kernels[0]));
} catch (const cl::BuildError& err) {
std::string err_log = "";
for (auto& p : err.getBuildLog()) {
err_log += p.second + '\n';
}
IE_THROW() << err_log;
}
}
}
ob << precompiled_kernels;
}
void kernels_cache::load(BinaryInputBuffer& ib) {
OPENVINO_ASSERT(_engine.type() == engine_types::ocl, "[GPU] Not supported engine type");
std::unordered_map<uint32_t, std::vector<unsigned char>> precompiled_kernels;
size_t num_cached_binaries;
ib >> num_cached_binaries;
for (size_t i = 0; i < num_cached_binaries; ++i) {
uint32_t id;
ib >> id;
ib >> precompiled_kernels[id];
}
std::unique_ptr<ocl::ocl_engine> build_engine =
cldnn::make_unique<ocl::ocl_engine>(_engine.get_device(), runtime_types::ocl);
std::map<std::string, std::string> entry_point_to_id;
std::vector<std::vector<unsigned char>> precompiled_kernels;
ib >> entry_point_to_id;
ib >> precompiled_kernels;
try {
std::lock_guard<std::mutex> lock(_mutex);
_kernels.clear();
_cached_kernels.clear();
for (auto& binary_kernels : precompiled_kernels) {
for (auto& precompiled_kernel : precompiled_kernels) {
cl::vector<cl::Kernel> kernels;
cl::Program program(build_engine->get_cl_context(), {build_engine->get_cl_device()}, {binary_kernels});
cl::Program program(build_engine->get_cl_context(), {build_engine->get_cl_device()}, {precompiled_kernel.second});
program.build({build_engine->get_cl_device()});
program.createKernels(&kernels);
for (auto& k : kernels) {
const auto& entry_point = k.getInfo<CL_KERNEL_FUNCTION_NAME>();
const auto& k_id = entry_point_to_id.find(entry_point);
if (k_id != entry_point_to_id.end()) {
std::string cached_kernel_id = entry_point + "@" + std::to_string(precompiled_kernel.first);
const auto& iter = _cached_kernels.find(cached_kernel_id);
if (iter == _cached_kernels.end()) {
cl_kernel cl_kernel = k.get();
cl_context cl_context = build_engine->get_cl_context().get();
kernel::ptr kernel = kernels_factory::create(_engine, cl_context, cl_kernel, entry_point);
_kernels.insert({k_id->second, kernel});
_kernel_idx++;
_cached_kernels[cached_kernel_id] = kernel;
}
}
}
@ -553,16 +562,15 @@ void kernels_cache::load(BinaryInputBuffer& ib) {
}
}
std::map<const std::string, kernel::ptr> kernels_cache::compile(std::vector<std::shared_ptr<kernel_string>> kernel_sources,
bool dump_custom_program) {
kernels_cache::compiled_kernels kernels_cache::compile(const kernel_impl_params& params,
const std::vector<std::shared_ptr<kernel_string>>& kernel_sources,
bool dump_custom_program) {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "KernelsCache::Compile_ThreadSafe");
kernels_code t_kernels_code;
// Get kernels code from kernel sources
for (size_t idx = 0; idx < kernel_sources.size(); ++idx) {
auto kernel_string = kernel_sources[idx];
kernel_id id = gen_kernel_id(kernel_string->entry_point);
t_kernels_code.emplace(kernel_string, id, dump_custom_program);
for (size_t k = 0; k < kernel_sources.size(); ++k) {
t_kernels_code.insert({params, {kernel_sources, params, dump_custom_program}});
}
ocl::ocl_engine& _build_engine = downcast<ocl::ocl_engine>(_engine);
@ -571,7 +579,7 @@ std::map<const std::string, kernel::ptr> kernels_cache::compile(std::vector<std:
std::vector<batch_program> batches;
get_program_source(t_kernels_code, &batches);
std::map<const std::string, kernel::ptr> output_kernels;
compiled_kernels output_kernels;
// Build batches
for (size_t idx = 0; idx < batches.size(); ++idx) {
build_batch(_build_engine, batches[idx], output_kernels);

View File

@ -8,6 +8,7 @@
#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/runtime/kernel.hpp"
#include "intel_gpu/runtime/execution_config.hpp"
#include "intel_gpu/graph/kernel_impl_params.hpp"
#include <map>
#include <mutex>
@ -25,6 +26,27 @@ namespace cldnn {
class kernels_cache {
public:
// One pending compilation unit: the kernel sources produced for a single
// kernel_impl_params key, queued in kernels_code until build_all()/compile().
struct kernel_code {
// Source strings (jit + body + undefs, entry point, options) for each kernel of this impl.
std::vector<std::shared_ptr<kernel_string>> kernel_strings;
// Impl params this code was generated for; also the lookup key into the compiled-kernels map.
kernel_impl_params params;
// When true, the sources are routed to a separate, dumpable program bucket.
bool dump_custom_program;
kernel_code(const std::vector<std::shared_ptr<kernel_string>>& _kernel_strings,
const kernel_impl_params& _params,
bool _dump_custom_program)
: kernel_strings(_kernel_strings),
params(_params),
dump_custom_program(_dump_custom_program) {}
};
// Hash functor that lets kernel_impl_params key the unordered_map containers
// below (kernels_code, compiled_kernels); forwards to kernel_impl_params::hash().
struct impl_hasher {
size_t operator()(const kernel_impl_params &k) const {
return k.hash();
}
};
using kernels_code = std::unordered_map<kernel_impl_params, kernel_code, impl_hasher>;
using source_code = std::vector<std::string>;
struct batch_program {
int32_t bucket_id;
@ -34,7 +56,7 @@ public:
source_code source;
std::string options;
bool dump_custom_program;
std::map<std::string, std::string> entry_point_to_id;
std::map<std::string, kernel_impl_params> entry_point_to_id;
explicit batch_program(int32_t _bucket_id, int32_t _batch_id, std::string _options, const std::vector<std::string>& batch_header_str)
: bucket_id(_bucket_id),
@ -48,32 +70,7 @@ public:
}
};
struct kernel_code {
std::shared_ptr<kernel_string> kernel_strings;
std::string id;
bool dump_custom_program;
size_t hash_value;
kernel_code(const std::shared_ptr<kernel_string>& _kernel_strings,
const std::string& _id,
bool _dump_custom_program)
: kernel_strings(_kernel_strings),
id(_id),
dump_custom_program(_dump_custom_program),
hash_value(_kernel_strings->get_hash()) {}
bool operator == (const kernel_code& rhs) const {
return (hash_value == rhs.hash_value);
}
};
struct cmp_kernel_code {
bool operator()(const kernel_code& x1, const kernel_code& x2) const {
return (x1.hash_value < x2.hash_value);
}
};
using kernels_code = std::set<kernel_code, cmp_kernel_code>;
using compiled_kernels = std::unordered_map<kernel_impl_params, std::vector<kernel::ptr>, impl_hasher>;
private:
static std::mutex _mutex;
@ -82,32 +79,27 @@ private:
ExecutionConfig _config;
uint32_t _prog_id = 0;
kernels_code _kernels_code;
static std::atomic<size_t> _kernel_idx;
std::atomic<bool> _pending_compilation{false};
std::map<const std::string, kernel::ptr> _kernels;
compiled_kernels _kernels;
std::map<std::vector<unsigned char>, uint32_t> _cached_binaries;
std::unordered_map<std::string, kernel::ptr> _cached_kernels;
std::vector<std::string> batch_header_str;
void get_program_source(const kernels_code& kernels_source_code, std::vector<batch_program>*) const;
void build_batch(const engine& build_engine, const batch_program& batch, std::map<const std::string, kernel::ptr>& compiled_kernels);
void build_batch(const engine& build_engine, const batch_program& batch, compiled_kernels& compiled_kernels);
std::string get_cache_path() const;
bool is_cache_enabled() const;
size_t get_max_kernels_per_batch() const;
inline std::string gen_kernel_id(std::string entry_point) {
// we need unique id in order to avoid conflict across topologies.
return entry_point + "_" + std::to_string((_kernel_idx++));
}
public:
explicit kernels_cache(engine& engine,
const ExecutionConfig& config,
uint32_t prog_id,
InferenceEngine::CPUStreamsExecutor::Ptr task_executor = nullptr,
const std::vector<std::string>& batch_header_str = {});
kernel_id set_kernel_source(const std::shared_ptr<kernel_string>& kernel_string,
bool dump_custom_program);
kernel::ptr get_kernel(kernel_id id) const;
kernel::ptr get_kernel_from_cached_kernels(std::string id) const;
std::vector<kernel::ptr> get_kernels(kernel_impl_params params) const;
void set_batch_header_str(const std::vector<std::string> &batch_headers) {
batch_header_str = std::move(batch_headers);
}
@ -117,14 +109,20 @@ public:
// forces compilation of all pending kernels/programs
void build_all();
void reset();
void remove_kernel(kernel_id id) {
_kernels.erase(id);
}
std::vector<kernel_id> add_kernels_source(std::vector<std::shared_ptr<kernel_string>> kernel_sources, bool dump_custom_program = false);
void add_kernels(const std::vector<std::string>& kernel_ids, const std::vector<kernel::ptr>& kernels);
void add_kernels_source(const kernel_impl_params& params,
const std::vector<std::shared_ptr<kernel_string>>& kernel_sources,
bool dump_custom_program = false);
compiled_kernels compile(const kernel_impl_params& params,
const std::vector<std::shared_ptr<kernel_string>>& kernel_sources,
bool dump_custom_program = false);
std::string get_cached_kernel_id(kernel::ptr kernel) const;
std::vector<std::string> get_cached_kernel_ids(const std::vector<kernel::ptr>& kernels) const;
void add_to_cached_kernels(const std::vector<kernel::ptr>& kernels);
void save(BinaryOutputBuffer& ob) const;
void load(BinaryInputBuffer& ib);
std::map<const std::string, kernel::ptr> compile(std::vector<std::shared_ptr<kernel_string>> kernel_sources, bool dump_custom_program = false);
};
} // namespace cldnn

View File

@ -0,0 +1,92 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "test_utils.h"
#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/graph/program.hpp"
#include "data_inst.h"
#include "eltwise_inst.h"
#include "reshape_inst.h"
#include "shape_of_inst.h"
#include "fully_connected_inst.h"
#include "permute_inst.h"
#include "reduce_inst.h"
#include "intel_gpu/graph/network.hpp"
#include "pass_manager.h"
#include "to_string_utils.h"
#include "program_wrapper.h"
#include <memory>
using namespace cldnn;
using namespace ::tests;
// Verifies that structurally identical primitives in a static model map to the
// same cached kernel id, i.e. the compiled OpenCL kernel is reused rather than
// duplicated: conv1/conv2 and concat1/concat2 below are built with identical
// shapes/types and are expected to resolve to identical cached-kernel ids.
TEST(kernels_cache, reuse_kernel_for_static_model_01) {
auto& engine = get_test_engine();
// Two parallel branches with identical input/weight shapes so both branches
// produce the same kernel sources.
auto input0 = engine.allocate_memory({{1, 1, 4, 5}, data_types::f16, format::bfyx});
auto input1 = engine.allocate_memory({{1, 1, 4, 5}, data_types::f16, format::bfyx});
auto input2 = engine.allocate_memory({{1, 1, 4, 5}, data_types::f16, format::bfyx});
auto input3 = engine.allocate_memory({{1, 1, 4, 5}, data_types::f16, format::bfyx});
auto input4 = engine.allocate_memory({{1, 1, 4, 5}, data_types::f16, format::bfyx});
auto input5 = engine.allocate_memory({{1, 1, 4, 5}, data_types::f16, format::bfyx});
auto weights1 = engine.allocate_memory({{1, 3, 2, 3 }, data_types::f16, format::bfyx});
auto weights2 = engine.allocate_memory({{1, 3, 2, 3 }, data_types::f16, format::bfyx});
// Branch 1: concat1 -> conv1; Branch 2: concat2 -> conv2; joined by eltwise sum.
topology topology(input_layout("input0", input0->get_layout()),
input_layout("input1", input1->get_layout()),
input_layout("input2", input2->get_layout()),
input_layout("input3", input3->get_layout()),
input_layout("input4", input4->get_layout()),
input_layout("input5", input5->get_layout()),
data("weights1", weights1),
data("weights2", weights2),
concatenation("concat1",
{ input_info("input0"), input_info("input1"), input_info("input2") },
1,
data_types::f16,
padding{{0, 0, 0, 0}, 0}),
convolution("conv1", input_info("concat1"), { "weights1" }, { 1, 1 }),
concatenation("concat2",
{ input_info("input3"), input_info("input4"), input_info("input5") },
1,
data_types::f16,
padding{{0, 0, 0, 0}, 0}),
convolution("conv2", input_info("concat2"), { "weights2" }, { 1, 1 }),
eltwise("sum", {input_info("concat1"), input_info("concat2")}, eltwise_mode::sum),
reorder("output", input_info("sum"), {{3, 2}, data_types::f16, format::bfyx}));
ExecutionConfig config;
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
auto prog = program::build_program(engine, topology, config, false, false);
auto& cache = prog->get_kernels_cache();
auto& conv1_node = prog->get_node("conv1");
auto& conv2_node = prog->get_node("conv2");
// Register both convolutions' kernels, then compare their cached-kernel ids:
// identical ids mean the two nodes share one compiled kernel binary.
auto conv1_kernels = conv1_node.get_selected_impl()->get_kernels();
cache.add_to_cached_kernels(conv1_kernels);
auto conv2_kernels = conv2_node.get_selected_impl()->get_kernels();
cache.add_to_cached_kernels(conv2_kernels);
ASSERT_EQ(conv1_kernels.size(), conv2_kernels.size());
for (size_t idx = 0; idx < conv1_kernels.size(); idx++) {
auto conv1_kern = cache.get_cached_kernel_id(conv1_kernels[idx]);
auto conv2_kern = cache.get_cached_kernel_id(conv2_kernels[idx]);
ASSERT_EQ(conv1_kern, conv2_kern);
}
// Same check for the two identical concatenation nodes.
auto& concat1_node = prog->get_node("concat1");
auto& concat2_node = prog->get_node("concat2");
auto concat1_kernels = concat1_node.get_selected_impl()->get_kernels();
cache.add_to_cached_kernels(concat1_kernels);
auto concat2_kernels = concat2_node.get_selected_impl()->get_kernels();
cache.add_to_cached_kernels(concat2_kernels);
ASSERT_EQ(concat1_kernels.size(), concat2_kernels.size());
for (size_t idx = 0; idx < concat1_kernels.size(); idx++) {
auto concat1_kern = cache.get_cached_kernel_id(concat1_kernels[idx]);
auto concat2_kern = cache.get_cached_kernel_id(concat2_kernels[idx]);
ASSERT_EQ(concat1_kern, concat2_kern);
}
}