[dGPU] Enable user scratchpad mode for onednn primitive. (#18699)

* [dGPU] Enable user scratchpad mode.
* Reuse intermediate buffer.
* Add own id to the memory dependencies at the c-tor of program_node
* Allocate intermediate memory with memory_pool::get_memory() function.
* Assign scratchpad memory desc in load() function for onednn primitive
serialization
* Allocate device mem for onednn scratchpad mem
This commit is contained in:
Jade Cho 2023-07-30 23:13:45 +09:00 committed by GitHub
parent 63ac68e745
commit c0783f16ed
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
17 changed files with 111 additions and 12 deletions

View File

@ -78,7 +78,7 @@ struct memory {
virtual event::ptr copy_to(stream& /* stream */, void* /* host_ptr */, bool blocking = true) = 0;
#ifdef ENABLE_ONEDNN_FOR_GPU
virtual dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) {
virtual dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) const {
throw std::runtime_error("[CLDNN] Can't convert memory object to onednn");
}
#endif

View File

@ -124,7 +124,6 @@ public:
const primitive_id& id,
uint32_t network_id,
allocation_type type);
void clear_pool();
void clear_pool_for_network(uint32_t network_id);
void release_memory(memory* memory, const primitive_id& id, uint32_t network_id);
};

View File

@ -108,6 +108,8 @@ public:
std::vector<uint8_t> prim_cache;
ib >> prim_cache;
_scratchpad_md = _pd.scratchpad_desc();
_prim = dnnl::concat(_pd, prim_cache);
#endif
}

View File

@ -213,6 +213,8 @@ public:
_pd = *prim_desc;
}
_scratchpad_md = _pd.scratchpad_desc();
std::vector<uint8_t> prim_cache;
ib >> prim_cache;

View File

@ -130,6 +130,8 @@ public:
std::vector<uint8_t> prim_cache;
ib >> prim_cache;
_scratchpad_md = _pd.scratchpad_desc();
_prim = dnnl::primitive(_pd, prim_cache);
#endif
}

View File

@ -168,6 +168,8 @@ public:
std::vector<uint8_t> prim_cache;
ib >> prim_cache;
_scratchpad_md = _pd.scratchpad_desc();
_prim = dnnl::primitive(_pd, prim_cache);
#endif
}

View File

@ -298,6 +298,8 @@ public:
std::vector<uint8_t> prim_cache;
ib >> prim_cache;
_scratchpad_md = _pd.scratchpad_desc();
_prim = dnnl::primitive(_pd, prim_cache);
#endif
}

View File

@ -144,6 +144,8 @@ public:
std::vector<uint8_t> prim_cache;
ib >> prim_cache;
_scratchpad_md = _pd.scratchpad_desc();
_prim = dnnl::primitive(_pd, prim_cache);
#endif
}

View File

@ -41,6 +41,7 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
PrimDescType _pd;
PrimType _prim;
std::unordered_map<uint32_t, std::unordered_map<int, dnnl::memory>> _args;
dnnl::memory::desc _scratchpad_md;
bool _enable_profiling = false;
typed_primitive_onednn_impl(const engine& engine,
@ -53,6 +54,24 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
_attrs(attrs),
_pd(pd) {
_enable_profiling = config.get_property(ov::enable_profiling);
_scratchpad_md = _pd.scratchpad_desc();
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(!debug_config->dump_profiling_data.empty()) {
_enable_profiling = true;
}
GPU_DEBUG_IF(debug_config->verbose >= 4) {
if (_scratchpad_md.get_size() > 0) {
static std::atomic_llong total{0};
int64_t size = _scratchpad_md.get_size() / 1048576;
total += size;
GPU_DEBUG_TRACE_DETAIL << " [scratchpad] kind: " << static_cast<int>(_pd.get_kind())
<< ", " << size << "MB, total " << total << "MB" << std::endl;
}
}
build_primitive(config);
}
@ -189,7 +208,7 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
if (has_attrs) {
{
dnnl::scratchpad_mode _scratchpad_mode = dnnl::scratchpad_mode::library;
dnnl::scratchpad_mode _scratchpad_mode = dnnl::scratchpad_mode::user;
ib >> make_data(&_scratchpad_mode, sizeof(dnnl::scratchpad_mode));
_attrs->set_scratchpad_mode(_scratchpad_mode);
}
@ -450,6 +469,12 @@ protected:
args.insert({DNNL_ARG_DST, output.get_onednn_memory(_pd.dnnl::primitive_desc_base::dst_desc(0), offset)});
}
if (_scratchpad_md.get_size() != 0) {
// onednn primitive can have only 1 scratchpad memory.
auto scratchpad = instance.get_intermediates_memories()[0];
args.insert({DNNL_ARG_SCRATCHPAD, scratchpad->get_onednn_memory(_scratchpad_md, 0)});
}
configure_post_ops_arguments(instance, args);
return args;
@ -511,6 +536,12 @@ protected:
return event;
}
// Report the oneDNN scratchpad as an internal (intermediate) buffer layout so the
// runtime allocates memory for it alongside the primitive's other internal buffers.
// Returns an empty vector when the primitive descriptor requires no scratchpad.
std::vector<layout> get_internal_buffer_layouts_impl() const override {
if (_scratchpad_md.get_size() == 0)
return {};
// A single u8 buffer of exactly _scratchpad_md.get_size() bytes, expressed as a
// bfyx tensor {1, 1, 1, byte_size} (oneDNN primitives use at most one scratchpad).
return {{{1, 1, 1, (tensor::value_type)(_scratchpad_md.get_size())}, cldnn::data_types::u8, format::bfyx}};
}
};
} // namespace onednn

View File

@ -143,6 +143,8 @@ public:
std::vector<uint8_t> prim_cache;
ib >> prim_cache;
_scratchpad_md = _pd.scratchpad_desc();
_prim = dnnl::primitive(_pd, prim_cache);
#endif
}

View File

@ -95,6 +95,8 @@ public:
std::vector<uint8_t> prim_cache;
ib >> prim_cache;
_scratchpad_md = _pd.scratchpad_desc();
_prim = dnnl::reorder(_pd, prim_cache);
#endif
}

View File

@ -988,7 +988,9 @@ memory::ptr primitive_inst::allocate_internal_buffer(size_t idx, bool reset) {
auto layout = ibuf_layouts[idx];
GPU_DEBUG_LOG << "[" << _node->id() << ": internal buf " << idx << "]" << std::endl;
auto alloc_type = allocation_type::unknown;
if (input_device_mem && ((int64_t) available_device_mem_size - (int64_t)layout.bytes_count() >= 0)) {
if ((int64_t)available_device_mem_size - (int64_t)layout.bytes_count() >= 0 &&
(input_device_mem || _node->get_preferred_impl_type() == impl_types::onednn)) {
// Scratchpad memory allocation is forced to device memory.
GPU_DEBUG_LOG << " input is device mem and available device mem size (" << available_device_mem_size
<< ") > requested memory (" << layout.bytes_count() << " )" << std::endl;
alloc_type = engine.get_preferred_memory_allocation_type();
@ -998,7 +1000,12 @@ memory::ptr primitive_inst::allocate_internal_buffer(size_t idx, bool reset) {
alloc_type = engine.get_lockable_preferred_memory_allocation_type();
}
GPU_DEBUG_LOG << "=> allocate to " << alloc_type << std::endl;
return engine.allocate_memory(layout, alloc_type, reset);
// Reuse intermediate buffer like output buffer.
auto ret_mem = _network.get_memory_pool().get_memory(layout, _node->id(), _network.get_id(), _node->get_memory_dependencies(), alloc_type, true, reset);
GPU_DEBUG_LOG << " [" << _network.get_id() << ":" << _node->id() << ": internal buf " << idx << "] " << alloc_type
<< " " << ret_mem->buffer_ptr() << std::endl;
return ret_mem;
}
void primitive_inst::allocate_internal_buffers(bool reset) {
@ -1681,7 +1688,8 @@ void primitive_inst::load(cldnn::BinaryInputBuffer& ib) {
allocation_type _allocation_type;
ib >> make_data(&_allocation_type, sizeof(_allocation_type));
_intermediates_memory[i] = get_network().get_engine().allocate_memory(ibuf_layout, _allocation_type);
_intermediates_memory[i] = get_network().get_memory_pool().get_memory(ibuf_layout, id(), get_network_id(),
_node_mem_deps, _allocation_type, true, true);
}
bool has_impl;

View File

@ -42,6 +42,7 @@ program_node::program_node(std::shared_ptr<primitive> prim, program& prog)
output_layouts.push_back(output_layout);
valid_output_layouts.push_back(false);
}
add_memory_dependency(id());
}
}
@ -944,6 +945,10 @@ void program_node::init_onednn_primitive_attributes() {
// Added this for debug purposes only
size_t empty_mem = 0xff;
// Change scratchpad mode to user
if (attrs->get_scratchpad_mode() == dnnl::scratchpad_mode::library)
attrs->set_scratchpad_mode(dnnl::scratchpad_mode::user);
// Add information about post-operation into the list, update indices
auto update_onednn_post_op_list = [&](onednn_post_op_type type, size_t m_dep,
dnnl::memory::format_tag tag = dnnl::memory::format_tag::undef,

View File

@ -240,8 +240,6 @@ memory::ptr memory_pool::get_memory(const layout& layout,
}
}
void memory_pool::clear_pool() { _non_padded_pool.clear(); }
void memory_pool::clear_pool_for_network(uint32_t network_id) {
// free up _non_padded_pool for this network
{

View File

@ -136,7 +136,7 @@ event::ptr gpu_buffer::copy_to(stream& stream, void* host_ptr, bool blocking) {
}
#ifdef ENABLE_ONEDNN_FOR_GPU
dnnl::memory gpu_buffer::get_onednn_memory(dnnl::memory::desc desc, int64_t offset) {
dnnl::memory gpu_buffer::get_onednn_memory(dnnl::memory::desc desc, int64_t offset) const {
auto onednn_engine = _engine->get_onednn_engine();
dnnl::memory dnnl_mem(desc, onednn_engine, DNNL_MEMORY_NONE);
dnnl::ocl_interop::set_mem_object(dnnl_mem, _buffer.get());
@ -482,7 +482,7 @@ event::ptr gpu_usm::copy_to(stream& stream, void* host_ptr, bool blocking) {
}
#ifdef ENABLE_ONEDNN_FOR_GPU
dnnl::memory gpu_usm::get_onednn_memory(dnnl::memory::desc desc, int64_t offset) {
dnnl::memory gpu_usm::get_onednn_memory(dnnl::memory::desc desc, int64_t offset) const {
auto onednn_engine = _engine->get_onednn_engine();
dnnl::memory dnnl_mem = dnnl::ocl_interop::make_memory(desc, onednn_engine, dnnl::ocl_interop::memory_kind::usm,
reinterpret_cast<uint8_t*>(_buffer.get()) + offset);

View File

@ -46,7 +46,7 @@ struct gpu_buffer : public lockable_gpu_mem, public memory {
event::ptr copy_to(stream& stream, void* other , bool blocking) override;
#ifdef ENABLE_ONEDNN_FOR_GPU
dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) override;
dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) const override;
#endif
protected:
@ -124,7 +124,7 @@ struct gpu_usm : public lockable_gpu_mem, public memory {
event::ptr copy_to(stream& stream, void* host_ptr, bool blocking) override;
#ifdef ENABLE_ONEDNN_FOR_GPU
dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) override;
dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) const override;
#endif
static allocation_type detect_allocation_type(const ocl_engine* engine, const void* mem_ptr);

View File

@ -12,6 +12,8 @@
#include "intel_gpu/primitives/concatenation.hpp"
#include "intel_gpu/primitives/reorder.hpp"
#include "intel_gpu/primitives/reshape.hpp"
#include "intel_gpu/primitives/fully_connected.hpp"
#include "primitive_inst.h"
#include "runtime/ocl/ocl_event.hpp"
@ -208,4 +210,44 @@ TEST(network_test, has_proper_event_for_in_order_queue_onednn) {
ASSERT_TRUE(downcast<ocl::ocl_base_event>(reorder_ev.get())->get().get() != nullptr);
ASSERT_TRUE(downcast<ocl::ocl_base_event>(activation_ev.get())->get().get() != nullptr);
}
// Verifies that two networks built from the same program each get their own
// oneDNN scratchpad (intermediate) buffer rather than aliasing one allocation.
TEST(network_test, scratchpad_test) {
auto& engine = get_test_engine();
// oneDNN impls require immad (XMX) support; skip otherwise.
if (!engine.get_device_info().supports_immad)
return;
// benchdnn parameters:
// --ip --engine=gpu:0 --dir=FWD_B --dt=f16:f16:f16 --stag=abcd --wtag=any --dtag=ab --attr-scratchpad=user mb16384ic768ih1iw1oc3072
layout in_layout{{16384, 768}, data_types::f16, format::bfyx};
auto input_mem = engine.allocate_memory(in_layout);
auto weights = engine.allocate_memory({{3072, 768}, data_types::f16, format::oiyx});
topology topology;
topology.add(input_layout("input", in_layout));
topology.add(data("weights", weights));
topology.add(fully_connected("fc_prim", input_info("input"), "weights", "", padding()));
// Force the FC primitive onto the oneDNN implementation so a scratchpad is used.
auto impl_desc_onednn = ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::onednn};
auto impl_forcing_map = ov::intel_gpu::ImplForcingMap{{"fc_prim", impl_desc_onednn}};
auto config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::optimize_data(false));
config.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
config.set_property(ov::intel_gpu::force_implementations(impl_forcing_map));
network net1(engine, topology, config);
net1.set_input_data("input", input_mem);
net1.execute();
// Second network shares net1's program but uses stream/network id 1.
network net2(net1.get_program(), 1);
net2.set_input_data("input", input_mem);
net2.execute();
auto fc1 = net1.get_primitive("fc_prim");
auto fc2 = net2.get_primitive("fc_prim");
// The scratchpad is intermediates memory [0]; the two networks must not share it.
ASSERT_TRUE(fc1->get_intermediates_memories()[0]->buffer_ptr() != fc2->get_intermediates_memories()[0]->buffer_ptr());
}
#endif