[GPU] implement lru_cache (#12349)

* Fix memory leak issue

Co-authored-by: Taylor Yeonbok Lee <taylor.lee@intel.com>

Paul Youngsoo Ahn authored 2022-08-03 15:25:44 +09:00, committed by GitHub
parent b449481439
commit 03b0199521
9 changed files with 313 additions and 1 deletion

@@ -247,6 +247,8 @@ public:
// returns {-1, -1} if it failed to estimate memory usage by allocating the given batch size
std::pair<int64_t/*const alloc*/, int64_t/*general alloc*/> get_estimated_device_mem_usage();
void remove_kernel(kernel_id id);
private:
uint32_t prog_id = 0;
engine& _engine;

@@ -0,0 +1,167 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
///////////////////////////////////////////////////////////////////////////////////////////////////
#pragma once
#include <list>
#include <unordered_map>
#include <functional>
#include <iostream>
namespace cldnn {
struct primitive_impl;
/// @brief LRU cache, which removes the least recently used entry when the cache is full.
template<typename Key, typename Value>
class LruCache {
public:
using data_type = std::pair<Key, Value>;
explicit LruCache(size_t caps) : _capacity(caps) {}
~LruCache() {
clear();
}
/**
 * @brief Get the least recently used element in the cache
 *
 * @return Value the LRU value, or a default-constructed Value if the cache is empty
*/
Value get_lru_element() const {
if (!_lru_data_list.empty()) {
return _lru_data_list.back().second;
} else {
return Value();
}
}
/**
* @brief Add new value with associated key into the LRU cache
*
 * @param key if the same key already exists in the cache, its value is updated with the new entry.
 * @param value
 * @return true if the cache was full and the least recently used entry was evicted to make room.
 * @return false otherwise
*/
bool add(const Key& key, const Value& value) {
auto map_iter = _key_map.find(key);
if (map_iter != _key_map.end()) {
touch_data(map_iter->second);
map_iter->second->second = value;
return false;
}
bool popped_last_element = false;
if (_capacity > 0 && _capacity == _key_map.size()) {
pop();
popped_last_element = true;
}
auto iter = _lru_data_list.insert(_lru_data_list.begin(), {key, value});
_key_map.insert({key, iter});
return popped_last_element;
}
/**
 * @brief Check whether a value associated with the key exists in the cache
 *
 * @param key
 * @return true if a value associated with the key exists.
 * @return false otherwise
*/
bool has(const Key& key) const {
return (_key_map.find(key) != _key_map.end());
}
/**
* @brief Find a value associated with a key
*
* @param key
 * @return Value the value associated with the given key; if the key does not exist in the cache, a default-constructed Value is returned
*/
Value get(const Key& key) {
auto iter = _key_map.find(key);
if (iter == _key_map.end()) {
return Value();
}
touch_data(iter->second);
return _lru_data_list.front().second;
}
/**
* @brief Remove all entries
*
*/
void clear() {
_lru_data_list.clear();
_key_map.clear();
}
/**
* @brief Return current size of cache
*
* @return size_t
*/
size_t size() const {
return _lru_data_list.size();
}
/**
* @brief Return capacity of the cache
*
* @return size_t
*/
size_t capacity() const {
return _capacity;
}
/**
 * @brief Get all keys in the cache, ordered from most to least recently used
*
* @return std::vector<Key>
*/
std::vector<Key> get_all_keys() const {
std::vector<Key> key_list;
for (auto& iter : _lru_data_list) {
key_list.push_back(iter.first);
}
return key_list;
}
private:
using lru_data_list_type = std::list<data_type>;
using lru_data_list_iter = typename lru_data_list_type::iterator;
lru_data_list_type _lru_data_list;
std::unordered_map<Key, lru_data_list_iter> _key_map;
const size_t _capacity;
/**
 * @brief Move the touched entry to the front of the list.
 *
 * @param iter iterator to the entry that was just accessed
*/
void touch_data(lru_data_list_iter iter) {
_lru_data_list.splice(_lru_data_list.begin(), _lru_data_list, iter);
}
/**
 * @brief Pop the n least recently used entries.
 *
 * @param n number of entries to pop
*/
void pop(size_t n = 1) {
for (size_t i = 0; i < n && !_lru_data_list.empty(); ++i) {
_key_map.erase(_lru_data_list.back().first);
_lru_data_list.pop_back();
}
}
};
using ImplementationsCache = cldnn::LruCache<std::string, std::shared_ptr<primitive_impl>>;
} // namespace cldnn
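
For orientation, here is a minimal usage sketch of the cache above (not part of the diff; the key type, values, and capacity are illustrative):

#include "intel_gpu/runtime/lru_cache.hpp"
#include <string>

void lru_cache_usage_sketch() {
    cldnn::LruCache<std::string, int> cache(2);  // room for two entries
    cache.add("a", 1);                 // returns false: cache not yet full
    cache.add("b", 2);                 // returns false
    cache.get("a");                    // touches "a"; "b" becomes the LRU entry
    bool evicted = cache.add("c", 3);  // returns true: "b" was evicted to make room
    (void)evicted;
    // cache.get_all_keys() now yields {"c", "a"}, most to least recently used
}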

@@ -73,6 +73,10 @@ struct custom_gpu_primitive_impl : typed_primitive_impl<custom_gpu_primitive> {
args.outputs = { instance.output_memory_ptr() };
return stream.enqueue_kernel(*_kernels.front(), cl_kernel.get()->params, args, events, instance.node.is_output());
}
std::vector<std::string> get_kernel_ids() override {
return {_kernel_id};
}
};
static kernel_selector::kernel_argument_element get_arg(custom_gpu_primitive::arg_desc arg) {

@@ -108,6 +108,10 @@ protected:
}
}
std::vector<std::string> get_kernel_ids() override {
return _kernel_ids;
}
std::vector<layout> get_internal_buffer_layouts_impl() const override {
if (_kernel_data.internalBufferSizes.empty())
return {};

@@ -49,6 +49,9 @@ struct primitive_impl {
virtual bool is_cpu() const { return true; }
virtual void init_kernels() = 0;
virtual std::unique_ptr<primitive_impl> clone() const = 0;
virtual std::vector<std::string> get_kernel_ids() {
return {};
}
protected:
std::string _kernel_name;

@@ -1556,3 +1556,7 @@ std::pair<int64_t, int64_t> program::get_estimated_device_mem_usage() {
return std::make_pair(const_sum, get_engine().get_used_device_memory(allocation_type::usm_device));
}
void program::remove_kernel(kernel_id id) {
_kernels_cache->remove_kernel(id);
}
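
A sketch of how get_kernel_ids() and program::remove_kernel() are intended to work together to fix the leak: when an implementation is evicted from the ImplementationsCache, its compiled kernels can be removed from the kernels cache. The names cache, key, impl, and prog below are hypothetical; this wiring is not part of the diff itself:

// Hypothetical wiring: release the kernels of an impl evicted from the LRU cache.
std::shared_ptr<primitive_impl> lru = cache.get_lru_element();
if (cache.add(key, impl)) {
    // add() returned true, so the cache was full and lru was just evicted;
    // erasing its kernels from the kernels_cache keeps them from leaking.
    for (auto& id : lru->get_kernel_ids())
        prog.remove_kernel(id);
}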

@@ -148,7 +148,7 @@ kernel_id kernels_cache::set_kernel_source(
bool dump_custom_program) {
std::lock_guard<std::mutex> lock(_mutex);
// we need a unique id in order to avoid conflicts across topologies.
- const auto kernel_num = _kernels.size() + _kernels_code.size();
+ const auto kernel_num = _kernels.size() + (_kernel_idx++);
kernel_id id = kernel_string->entry_point + "_" + std::to_string(kernel_num);
auto res = _kernels_code.emplace(kernel_string, id, dump_custom_program);
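
The switch from _kernels_code.size() to a monotonically increasing _kernel_idx matters once kernels can be removed: a size-based suffix can repeat after a removal and collide with an existing id. A small self-contained illustration (the entry-point name "ep" is made up):

#include <cassert>
#include <set>
#include <string>

int main() {
    std::set<std::string> ids;  // stands in for the existing kernel ids
    ids.insert("ep_" + std::to_string(ids.size()));  // mints "ep_0"
    ids.insert("ep_" + std::to_string(ids.size()));  // mints "ep_1"
    ids.erase("ep_0");                               // a kernel is removed
    // size() is 1 again, so a size-based scheme would mint "ep_1" a second time:
    assert(ids.count("ep_" + std::to_string(ids.size())) == 1);  // collision
    // A counter that only grows mints "ep_2" next, which stays unique:
    size_t kernel_idx = 2;
    assert(ids.count("ep_" + std::to_string(kernel_idx)) == 0);
    return 0;
}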

@@ -75,6 +75,7 @@ private:
engine& _engine;
uint32_t _prog_id = 0;
kernels_code _kernels_code;
size_t _kernel_idx = 0;
std::atomic<bool> _pending_compilation{false};
std::map<const std::string, kernel::ptr> _kernels;
std::vector<std::string> batch_header_str;
@@ -97,6 +98,9 @@ public:
// forces compilation of all pending kernels/programs
void build_all();
void reset();
void remove_kernel(kernel_id id) {
_kernels.erase(id);
}
};
} // namespace cldnn

@@ -0,0 +1,124 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "test_utils.h"
#include "intel_gpu/runtime/lru_cache.hpp"
#include <vector>
using namespace cldnn;
using namespace ::tests;
TEST(lru_cache, basic_data_type) {
const size_t cap = 4;
LruCache<int, int> ca(cap);  // capacity counts entries, so pass cap directly
std::vector<int> inputs = {1, 2, 3, 4, 2, 1, 5};
std::vector<std::pair<int, int>> input_values;
for (auto i : inputs) {
input_values.push_back(std::make_pair(i, i + 10));
}
EXPECT_EQ(ca.get_lru_element(), int());
std::vector<bool> expected_hitted = {false, false, false, false, true, true, false};
for (size_t i = 0; i < input_values.size(); i++) {
auto& in = input_values[i];
int data = 0;
bool hitted = ca.has(in.first);
if (hitted) {
data = ca.get(in.first);
} else {
ca.add(in.first, in.second);
data = ca.get(in.first);
}
EXPECT_EQ(data, in.second);
EXPECT_EQ(hitted, (bool)expected_hitted[i]);
}
std::vector<std::pair<int, int>> expected_value;
for (size_t i = ca.size(); i > 0; i--) { // 5, 1, 2, 4
int idx = input_values.size() - i;
expected_value.push_back(input_values[idx]);
}
int idx = expected_value.size() - 1;
for (auto key : ca.get_all_keys()) {
EXPECT_EQ(key, expected_value[idx--].first);
}
}
class lru_cache_test_data {
public:
lru_cache_test_data(int a, int b, int c) : x(a), y(b), z(c) {
key = "key_" + std::to_string(a) + "_" + std::to_string(b) + "_" + std::to_string(c);
}
bool operator==(const lru_cache_test_data& rhs) const {
    return (this->x == rhs.x && this->y == rhs.y && this->z == rhs.z);
}
bool operator!=(const lru_cache_test_data& rhs) const {
    return (this->x != rhs.x || this->y != rhs.y || this->z != rhs.z);
}
operator std::string() {
return "(" + std::to_string(x) + "," + std::to_string(y) + "," + std::to_string(z) + ")";
}
std::string key;
int x;
int y;
int z;
};
using test_impl_cache = LruCache<std::string, std::shared_ptr<lru_cache_test_data>>;
TEST(lru_cache, custom_data_type) {
const size_t cap = 4;
test_impl_cache ca(cap);
std::vector<std::shared_ptr<lru_cache_test_data>> inputs;
inputs.push_back(std::make_shared<lru_cache_test_data>(1, 21, 11));
inputs.push_back(std::make_shared<lru_cache_test_data>(2, 22, 12));
inputs.push_back(std::make_shared<lru_cache_test_data>(3, 23, 13));
inputs.push_back(std::make_shared<lru_cache_test_data>(4, 24, 14));
inputs.push_back(std::make_shared<lru_cache_test_data>(2, 22, 12));
inputs.push_back(std::make_shared<lru_cache_test_data>(1, 21, 11));
inputs.push_back(std::make_shared<lru_cache_test_data>(3, 23, 13));
inputs.push_back(std::make_shared<lru_cache_test_data>(5, 25, 15));
std::vector<bool> expected_hitted = {false, false, false, false, true, true, true, false};
EXPECT_EQ(ca.get_lru_element(), std::shared_ptr<lru_cache_test_data>());
for (size_t i = 0; i < inputs.size(); i++) {
auto& in = inputs[i];
std::shared_ptr<lru_cache_test_data> p_data;
bool hitted = ca.has(in->key);
if (hitted) {
p_data = ca.get(in->key);
} else {
ca.add(in->key, in);
p_data = ca.get(in->key);
}
EXPECT_EQ(p_data->key, in->key);
EXPECT_EQ(hitted, (bool)expected_hitted[i]);
}
EXPECT_EQ(cap, ca.size());
std::vector<std::string> expected_keys;
for (size_t i = cap; i > 0; i--) {
expected_keys.push_back(inputs[inputs.size() - i]->key);
}
int idx = expected_keys.size() - 1;
for (auto key : ca.get_all_keys()) {
EXPECT_EQ(key, expected_keys[idx--]);
}
}