* Fix memory leak issue Co-authored-by: Taylor Yeonbok Lee <taylor.lee@intel.com>
This commit is contained in:
parent
b449481439
commit
03b0199521
@ -247,6 +247,8 @@ public:
|
|||||||
// returns {-1, -1} if it failed to estimate by allocating given batch size
|
// returns {-1, -1} if it failed to estimate by allocating given batch size
|
||||||
std::pair<int64_t/*const alloc*/, int64_t/*general alloc*/> get_estimated_device_mem_usage();
|
std::pair<int64_t/*const alloc*/, int64_t/*general alloc*/> get_estimated_device_mem_usage();
|
||||||
|
|
||||||
|
void remove_kernel(kernel_id id);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
uint32_t prog_id = 0;
|
uint32_t prog_id = 0;
|
||||||
engine& _engine;
|
engine& _engine;
|
||||||
|
167
src/plugins/intel_gpu/include/intel_gpu/runtime/lru_cache.hpp
Normal file
167
src/plugins/intel_gpu/include/intel_gpu/runtime/lru_cache.hpp
Normal file
@ -0,0 +1,167 @@
|
|||||||
|
// Copyright (C) 2018-2022 Intel Corporation
|
||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
//
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <list>
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <functional>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
namespace cldnn {
|
||||||
|
|
||||||
|
struct primitive_impl;
|
||||||
|
|
||||||
|
/// @brief LRU (least-recently-used) cache: when the cache is full, adding a
///        new entry evicts the entry that was touched longest ago.
template<typename Key, typename Value>
class LruCache {
public:
    using data_type = std::pair<Key, Value>;

public:
    /// @param caps maximum number of entries kept at once. A capacity of 0
    ///        disables eviction entirely (the cache grows without bound).
    explicit LruCache(size_t caps) : _capacity(caps) {}

    ~LruCache() {
        clear();
    }

    /**
     * @brief Get the least recently used element in the cache.
     *
     * @return Value the LRU value, or a default-constructed Value when the
     *         cache is empty.
     */
    Value get_lru_element() const {
        return _entries.empty() ? Value() : _entries.back().second;
    }

    /**
     * @brief Add a new value under the given key. If the key already exists,
     *        its value is overwritten and the entry becomes most recently used.
     *
     * @param key   lookup key
     * @param value payload to store
     * @return true when the cache was full and the least recently used entry
     *         was evicted to make room.
     * @return false otherwise.
     */
    bool add(const Key& key, const Value& value) {
        auto found = _index.find(key);
        if (found != _index.end()) {
            // Existing entry: promote it to most-recently-used and replace
            // the stored payload.
            touch_data(found->second);
            found->second->second = value;
            return false;
        }

        const bool evicted = (_capacity > 0 && _index.size() == _capacity);
        if (evicted)
            pop();
        _entries.push_front({key, value});
        _index.emplace(key, _entries.begin());
        return evicted;
    }

    /**
     * @brief Check whether a value associated with the key exists in the cache.
     *
     * @param key
     * @return true if a value associated with the key exists.
     * @return false otherwise.
     */
    bool has(const Key& key) const {
        return _index.count(key) != 0;
    }

    /**
     * @brief Find the value associated with a key and mark it most recently
     *        used.
     *
     * @param key
     * @return Value the stored value; on a miss, a default-constructed Value
     *         (e.g. nullptr for smart-pointer payloads).
     */
    Value get(const Key& key) {
        auto found = _index.find(key);
        if (found == _index.end()) {
            return Value();
        }
        touch_data(found->second);
        // touch_data moved the entry to the front of the recency list.
        return _entries.front().second;
    }

    /**
     * @brief Remove all entries.
     */
    void clear() {
        _entries.clear();
        _index.clear();
    }

    /**
     * @brief Return the current number of entries in the cache.
     *
     * @return size_t
     */
    size_t size() const {
        return _entries.size();
    }

    /**
     * @brief Return the capacity of the cache.
     *
     * @return size_t
     */
    size_t capacity() const {
        return _capacity;
    }

    /**
     * @brief Get all keys currently stored, ordered from most to least
     *        recently used.
     *
     * @return std::vector<Key>
     */
    std::vector<Key> get_all_keys() const {
        std::vector<Key> keys;
        keys.reserve(_entries.size());
        for (const auto& entry : _entries) {
            keys.push_back(entry.first);
        }
        return keys;
    }

private:
    using lru_data_list_type = std::list<data_type>;
    using lru_data_list_iter = typename lru_data_list_type::iterator;

    lru_data_list_type _entries;                         // front = MRU, back = LRU
    std::unordered_map<Key, lru_data_list_iter> _index;  // key -> node in _entries
    const size_t _capacity;

    /**
     * @brief Move touched data to the front of the recency list.
     *
     * @param iter iterator of the touched entry
     */
    void touch_data(lru_data_list_iter iter) {
        _entries.splice(_entries.begin(), _entries, iter);
    }

    /**
     * @brief Pop the n least recently used cache entries.
     *
     * @param n number of entries to be popped
     */
    void pop(size_t n = 1) {
        while (n-- > 0 && !_entries.empty()) {
            _index.erase(_entries.back().first);
            _entries.pop_back();
        }
    }
};
|
||||||
|
|
||||||
|
using ImplementationsCache = cldnn::LruCache<std::string, std::shared_ptr<primitive_impl>>;
|
||||||
|
} // namespace cldnn
|
@ -73,6 +73,10 @@ struct custom_gpu_primitive_impl : typed_primitive_impl<custom_gpu_primitive> {
|
|||||||
args.outputs = { instance.output_memory_ptr() };
|
args.outputs = { instance.output_memory_ptr() };
|
||||||
return stream.enqueue_kernel(*_kernels.front(), cl_kernel.get()->params, args, events, instance.node.is_output());
|
return stream.enqueue_kernel(*_kernels.front(), cl_kernel.get()->params, args, events, instance.node.is_output());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::vector<std::string> get_kernel_ids() override {
|
||||||
|
return {_kernel_id};
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
static kernel_selector::kernel_argument_element get_arg(custom_gpu_primitive::arg_desc arg) {
|
static kernel_selector::kernel_argument_element get_arg(custom_gpu_primitive::arg_desc arg) {
|
||||||
|
@ -108,6 +108,10 @@ protected:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::vector<std::string> get_kernel_ids() override {
|
||||||
|
return _kernel_ids;
|
||||||
|
}
|
||||||
|
|
||||||
std::vector<layout> get_internal_buffer_layouts_impl() const override {
|
std::vector<layout> get_internal_buffer_layouts_impl() const override {
|
||||||
if (_kernel_data.internalBufferSizes.empty())
|
if (_kernel_data.internalBufferSizes.empty())
|
||||||
return {};
|
return {};
|
||||||
|
@ -49,6 +49,9 @@ struct primitive_impl {
|
|||||||
virtual bool is_cpu() const { return true; }
|
virtual bool is_cpu() const { return true; }
|
||||||
virtual void init_kernels() = 0;
|
virtual void init_kernels() = 0;
|
||||||
virtual std::unique_ptr<primitive_impl> clone() const = 0;
|
virtual std::unique_ptr<primitive_impl> clone() const = 0;
|
||||||
|
virtual std::vector<std::string> get_kernel_ids() {
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
std::string _kernel_name;
|
std::string _kernel_name;
|
||||||
|
@ -1556,3 +1556,7 @@ std::pair<int64_t, int64_t> program::get_estimated_device_mem_usage() {
|
|||||||
|
|
||||||
return std::make_pair(const_sum, get_engine().get_used_device_memory(allocation_type::usm_device));
|
return std::make_pair(const_sum, get_engine().get_used_device_memory(allocation_type::usm_device));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void program::remove_kernel(kernel_id id) {
|
||||||
|
_kernels_cache->remove_kernel(id);
|
||||||
|
}
|
||||||
|
@ -148,7 +148,7 @@ kernel_id kernels_cache::set_kernel_source(
|
|||||||
bool dump_custom_program) {
|
bool dump_custom_program) {
|
||||||
std::lock_guard<std::mutex> lock(_mutex);
|
std::lock_guard<std::mutex> lock(_mutex);
|
||||||
// we need unique id in order to avoid conflict across topologies.
|
// we need unique id in order to avoid conflict across topologies.
|
||||||
const auto kernel_num = _kernels.size() + _kernels_code.size();
|
const auto kernel_num = _kernels.size() + (_kernel_idx++);
|
||||||
kernel_id id = kernel_string->entry_point + "_" + std::to_string(kernel_num);
|
kernel_id id = kernel_string->entry_point + "_" + std::to_string(kernel_num);
|
||||||
|
|
||||||
auto res = _kernels_code.emplace(kernel_string, id, dump_custom_program);
|
auto res = _kernels_code.emplace(kernel_string, id, dump_custom_program);
|
||||||
|
@ -75,6 +75,7 @@ private:
|
|||||||
engine& _engine;
|
engine& _engine;
|
||||||
uint32_t _prog_id = 0;
|
uint32_t _prog_id = 0;
|
||||||
kernels_code _kernels_code;
|
kernels_code _kernels_code;
|
||||||
|
size_t _kernel_idx = 0;
|
||||||
std::atomic<bool> _pending_compilation{false};
|
std::atomic<bool> _pending_compilation{false};
|
||||||
std::map<const std::string, kernel::ptr> _kernels;
|
std::map<const std::string, kernel::ptr> _kernels;
|
||||||
std::vector<std::string> batch_header_str;
|
std::vector<std::string> batch_header_str;
|
||||||
@ -97,6 +98,9 @@ public:
|
|||||||
// forces compilation of all pending kernels/programs
|
// forces compilation of all pending kernels/programs
|
||||||
void build_all();
|
void build_all();
|
||||||
void reset();
|
void reset();
|
||||||
|
void remove_kernel(kernel_id id) {
|
||||||
|
_kernels.erase(id);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace cldnn
|
} // namespace cldnn
|
||||||
|
124
src/plugins/intel_gpu/tests/test_cases/lru_caches_gpu_test.cpp
Normal file
124
src/plugins/intel_gpu/tests/test_cases/lru_caches_gpu_test.cpp
Normal file
@ -0,0 +1,124 @@
|
|||||||
|
// Copyright (C) 2018-2022 Intel Corporation
|
||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
//
|
||||||
|
|
||||||
|
#include "test_utils.h"
|
||||||
|
|
||||||
|
#include "intel_gpu/runtime/lru_cache.hpp"
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
using namespace cldnn;
|
||||||
|
using namespace ::tests;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// Exercises LruCache with a primitive value type (int -> int), including the
// eviction path: capacity is 4 entries while 5 distinct keys are inserted.
TEST(lru_cache, basic_data_type)
{
    const size_t cap = 4;
    // Capacity is a number of entries, not a byte size: the previous
    // `cap * sizeof(int)` (== 16) made the cache so large that eviction never
    // happened, contradicting the expected key order below (// 5, 1, 2, 4).
    LruCache<int, int> ca(cap);

    std::vector<int> inputs = {1, 2, 3, 4, 2, 1, 5};
    std::vector<std::pair<int, int>> input_values;
    for (auto i : inputs) {
        input_values.push_back(std::make_pair(i, i + 10));
    }

    // An empty cache reports a default-constructed value as its LRU element.
    EXPECT_EQ(ca.get_lru_element(), int());

    std::vector<bool> expected_hitted = {false, false, false, false, true, true, false};
    for (size_t i = 0; i < input_values.size(); i++) {
        auto& in = input_values[i];
        int data = 0;
        bool hitted = ca.has(in.first);
        if (hitted) {
            data = ca.get(in.first);
        } else {
            ca.add(in.first, in.second);
            data = ca.get(in.first);
        }
        EXPECT_EQ(data, in.second);
        EXPECT_EQ(hitted, (bool)expected_hitted[i]);
    }

    // With capacity 4 the cache now holds exactly the last four touched keys.
    std::vector<std::pair<int, int>> expected_value;
    for (size_t i = ca.size(); i > 0; i--) {  // 5, 1, 2, 4
        int idx = input_values.size() - i;
        expected_value.push_back(input_values[idx]);
    }

    // get_all_keys() returns keys MRU-first, so walk expectations backwards.
    int idx = expected_value.size() - 1;
    for (auto key : ca.get_all_keys()) {
        EXPECT_EQ(key, expected_value[idx--].first);
    }
}
|
||||||
|
|
||||||
|
// Simple value type used to exercise LruCache with a user-defined payload.
// The key member is derived from the three coordinates at construction time.
class lru_cache_test_data {
public:
    lru_cache_test_data(int a, int b, int c) : x(a), y(b), z(c) {
        key = "key_" + std::to_string(a) + "_" + std::to_string(b) + "_" + std::to_string(c);
    }

    // Two objects are equal iff all three coordinates match.
    bool operator==(const lru_cache_test_data& rhs) {
        return x == rhs.x && y == rhs.y && z == rhs.z;
    }

    bool operator!=(const lru_cache_test_data& rhs) {
        return !(x == rhs.x && y == rhs.y && z == rhs.z);
    }

    // Human-readable "(x,y,z)" form, handy for failure messages.
    operator std::string() {
        return "(" + std::to_string(x) + "," + std::to_string(y) + "," + std::to_string(z) + ")";
    }

    std::string key;
    int x;
    int y;
    int z;
};
|
||||||
|
|
||||||
|
using test_impl_cache = LruCache<std::string, std::shared_ptr<lru_cache_test_data>>;
|
||||||
|
|
||||||
|
TEST(lru_cache, custom_data_type) {
|
||||||
|
const size_t cap = 4;
|
||||||
|
test_impl_cache ca(cap);
|
||||||
|
|
||||||
|
std::vector<std::shared_ptr<lru_cache_test_data>> inputs;
|
||||||
|
inputs.push_back(std::make_shared<lru_cache_test_data>(1, 21, 11));
|
||||||
|
inputs.push_back(std::make_shared<lru_cache_test_data>(2, 22, 12));
|
||||||
|
inputs.push_back(std::make_shared<lru_cache_test_data>(3, 23, 13));
|
||||||
|
inputs.push_back(std::make_shared<lru_cache_test_data>(4, 24, 14));
|
||||||
|
inputs.push_back(std::make_shared<lru_cache_test_data>(2, 22, 12));
|
||||||
|
inputs.push_back(std::make_shared<lru_cache_test_data>(1, 21, 11));
|
||||||
|
inputs.push_back(std::make_shared<lru_cache_test_data>(3, 23, 13));
|
||||||
|
inputs.push_back(std::make_shared<lru_cache_test_data>(5, 25, 15));
|
||||||
|
|
||||||
|
std::vector<bool> expected_hitted = {false, false, false, false, true, true, true, false};
|
||||||
|
|
||||||
|
EXPECT_EQ(ca.get_lru_element(), std::shared_ptr<lru_cache_test_data>());
|
||||||
|
for (size_t i = 0; i < inputs.size(); i++) {
|
||||||
|
auto& in = inputs[i];
|
||||||
|
std::shared_ptr<lru_cache_test_data> p_data;
|
||||||
|
bool hitted = ca.has(in->key);
|
||||||
|
if (hitted) {
|
||||||
|
p_data = ca.get(in->key);
|
||||||
|
} else {
|
||||||
|
ca.add(in->key, in);
|
||||||
|
p_data = ca.get(in->key);
|
||||||
|
}
|
||||||
|
EXPECT_EQ(p_data->key, in->key);
|
||||||
|
EXPECT_EQ(hitted, (bool)expected_hitted[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
EXPECT_EQ(cap, ca.size());
|
||||||
|
|
||||||
|
std::vector<std::string> expected_keys;
|
||||||
|
for (size_t i = cap; i > 0; i--) {
|
||||||
|
expected_keys.push_back(inputs[inputs.size() - i]->key);
|
||||||
|
}
|
||||||
|
|
||||||
|
int idx = expected_keys.size() - 1;
|
||||||
|
for (auto key : ca.get_all_keys()) {
|
||||||
|
EXPECT_EQ(key, expected_keys[idx--]);
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user