[dGPU] Enable user scratchpad mode for onednn primitive. (#18699)

* [dGPU] Enable user scratchpad mode.
* Reuse intermediate buffer.
* Add own id to the memory dependencies at the c-tor of program_node
* Allocate intermediate memory with memory_pool::get_memory() function.
* Assign scratchpad memory desc in load() function for onednn primitive
serialization
* Allocate device mem for onednn scratchpad mem
This commit is contained in:
Jade Cho 2023-07-30 23:13:45 +09:00 committed by GitHub
parent 63ac68e745
commit c0783f16ed
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
17 changed files with 111 additions and 12 deletions

View File

@ -78,7 +78,7 @@ struct memory {
virtual event::ptr copy_to(stream& /* stream */, void* /* host_ptr */, bool blocking = true) = 0;
#ifdef ENABLE_ONEDNN_FOR_GPU
virtual dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) {
virtual dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) const {
throw std::runtime_error("[CLDNN] Can't convert memory object to onednn");
}
#endif

View File

@ -124,7 +124,6 @@ public:
const primitive_id& id,
uint32_t network_id,
allocation_type type);
void clear_pool();
void clear_pool_for_network(uint32_t network_id);
void release_memory(memory* memory, const primitive_id& id, uint32_t network_id);
};

View File

@ -108,6 +108,8 @@ public:
std::vector<uint8_t> prim_cache;
ib >> prim_cache;
_scratchpad_md = _pd.scratchpad_desc();
_prim = dnnl::concat(_pd, prim_cache);
#endif
}

View File

@ -213,6 +213,8 @@ public:
_pd = *prim_desc;
}
_scratchpad_md = _pd.scratchpad_desc();
std::vector<uint8_t> prim_cache;
ib >> prim_cache;

View File

@ -130,6 +130,8 @@ public:
std::vector<uint8_t> prim_cache;
ib >> prim_cache;
_scratchpad_md = _pd.scratchpad_desc();
_prim = dnnl::primitive(_pd, prim_cache);
#endif
}

View File

@ -168,6 +168,8 @@ public:
std::vector<uint8_t> prim_cache;
ib >> prim_cache;
_scratchpad_md = _pd.scratchpad_desc();
_prim = dnnl::primitive(_pd, prim_cache);
#endif
}

View File

@ -298,6 +298,8 @@ public:
std::vector<uint8_t> prim_cache;
ib >> prim_cache;
_scratchpad_md = _pd.scratchpad_desc();
_prim = dnnl::primitive(_pd, prim_cache);
#endif
}

View File

@ -144,6 +144,8 @@ public:
std::vector<uint8_t> prim_cache;
ib >> prim_cache;
_scratchpad_md = _pd.scratchpad_desc();
_prim = dnnl::primitive(_pd, prim_cache);
#endif
}

View File

@ -41,6 +41,7 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
PrimDescType _pd;
PrimType _prim;
std::unordered_map<uint32_t, std::unordered_map<int, dnnl::memory>> _args;
dnnl::memory::desc _scratchpad_md;
bool _enable_profiling = false;
typed_primitive_onednn_impl(const engine& engine,
@ -53,6 +54,24 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
_attrs(attrs),
_pd(pd) {
_enable_profiling = config.get_property(ov::enable_profiling);
_scratchpad_md = _pd.scratchpad_desc();
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(!debug_config->dump_profiling_data.empty()) {
_enable_profiling = true;
}
GPU_DEBUG_IF(debug_config->verbose >= 4) {
if (_scratchpad_md.get_size() > 0) {
static std::atomic_llong total{0};
int64_t size = _scratchpad_md.get_size() / 1048576;
total += size;
GPU_DEBUG_TRACE_DETAIL << " [scratchpad] kind: " << static_cast<int>(_pd.get_kind())
<< ", " << size << "MB, total " << total << "MB" << std::endl;
}
}
build_primitive(config);
}
@ -189,7 +208,7 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
if (has_attrs) {
{
dnnl::scratchpad_mode _scratchpad_mode = dnnl::scratchpad_mode::library;
dnnl::scratchpad_mode _scratchpad_mode = dnnl::scratchpad_mode::user;
ib >> make_data(&_scratchpad_mode, sizeof(dnnl::scratchpad_mode));
_attrs->set_scratchpad_mode(_scratchpad_mode);
}
@ -450,6 +469,12 @@ protected:
args.insert({DNNL_ARG_DST, output.get_onednn_memory(_pd.dnnl::primitive_desc_base::dst_desc(0), offset)});
}
if (_scratchpad_md.get_size() != 0) {
// onednn primitive can have only 1 scratchpad memory.
auto scratchpad = instance.get_intermediates_memories()[0];
args.insert({DNNL_ARG_SCRATCHPAD, scratchpad->get_onednn_memory(_scratchpad_md, 0)});
}
configure_post_ops_arguments(instance, args);
return args;
@ -511,6 +536,12 @@ protected:
return event;
}
// Report the oneDNN scratchpad as an internal (intermediate) buffer layout so the
// runtime allocates memory for it alongside the primitive's other internal buffers.
// Returns an empty vector when the primitive descriptor requires no scratchpad.
std::vector<layout> get_internal_buffer_layouts_impl() const override {
if (_scratchpad_md.get_size() == 0)
return {};
// A single u8 buffer of exactly _scratchpad_md.get_size() bytes, expressed as a
// bfyx tensor {1, 1, 1, byte_size} (oneDNN primitives use at most one scratchpad).
return {{{1, 1, 1, (tensor::value_type)(_scratchpad_md.get_size())}, cldnn::data_types::u8, format::bfyx}};
}
};
} // namespace onednn

View File

@ -143,6 +143,8 @@ public:
std::vector<uint8_t> prim_cache;
ib >> prim_cache;
_scratchpad_md = _pd.scratchpad_desc();
_prim = dnnl::primitive(_pd, prim_cache);
#endif
}

View File

@ -95,6 +95,8 @@ public:
std::vector<uint8_t> prim_cache;
ib >> prim_cache;
_scratchpad_md = _pd.scratchpad_desc();
_prim = dnnl::reorder(_pd, prim_cache);
#endif
}

View File

@ -988,7 +988,9 @@ memory::ptr primitive_inst::allocate_internal_buffer(size_t idx, bool reset) {
auto layout = ibuf_layouts[idx];
GPU_DEBUG_LOG << "[" << _node->id() << ": internal buf " << idx << "]" << std::endl;
auto alloc_type = allocation_type::unknown;
if (input_device_mem && ((int64_t) available_device_mem_size - (int64_t)layout.bytes_count() >= 0)) {
if ((int64_t)available_device_mem_size - (int64_t)layout.bytes_count() >= 0 &&
(input_device_mem || _node->get_preferred_impl_type() == impl_types::onednn)) {
// Scratchpad memory allocation is forced to device memory.
GPU_DEBUG_LOG << " input is device mem and available device mem size (" << available_device_mem_size
<< ") > requested memory (" << layout.bytes_count() << " )" << std::endl;
alloc_type = engine.get_preferred_memory_allocation_type();
@ -998,7 +1000,12 @@ memory::ptr primitive_inst::allocate_internal_buffer(size_t idx, bool reset) {
alloc_type = engine.get_lockable_preferred_memory_allocation_type();
}
GPU_DEBUG_LOG << "=> allocate to " << alloc_type << std::endl;
return engine.allocate_memory(layout, alloc_type, reset);
// Reuse intermediate buffer like output buffer.
auto ret_mem = _network.get_memory_pool().get_memory(layout, _node->id(), _network.get_id(), _node->get_memory_dependencies(), alloc_type, true, reset);
GPU_DEBUG_LOG << " [" << _network.get_id() << ":" << _node->id() << ": internal buf " << idx << "] " << alloc_type
<< " " << ret_mem->buffer_ptr() << std::endl;
return ret_mem;
}
void primitive_inst::allocate_internal_buffers(bool reset) {
@ -1681,7 +1688,8 @@ void primitive_inst::load(cldnn::BinaryInputBuffer& ib) {
allocation_type _allocation_type;
ib >> make_data(&_allocation_type, sizeof(_allocation_type));
_intermediates_memory[i] = get_network().get_engine().allocate_memory(ibuf_layout, _allocation_type);
_intermediates_memory[i] = get_network().get_memory_pool().get_memory(ibuf_layout, id(), get_network_id(),
_node_mem_deps, _allocation_type, true, true);
}
bool has_impl;

View File

@ -42,6 +42,7 @@ program_node::program_node(std::shared_ptr<primitive> prim, program& prog)
output_layouts.push_back(output_layout);
valid_output_layouts.push_back(false);
}
add_memory_dependency(id());
}
}
@ -944,6 +945,10 @@ void program_node::init_onednn_primitive_attributes() {
// Added this for debug purposes only
size_t empty_mem = 0xff;
// Change scratchpad mode to user
if (attrs->get_scratchpad_mode() == dnnl::scratchpad_mode::library)
attrs->set_scratchpad_mode(dnnl::scratchpad_mode::user);
// Add information about post-operation into the list, update indices
auto update_onednn_post_op_list = [&](onednn_post_op_type type, size_t m_dep,
dnnl::memory::format_tag tag = dnnl::memory::format_tag::undef,

View File

@ -240,8 +240,6 @@ memory::ptr memory_pool::get_memory(const layout& layout,
}
}
void memory_pool::clear_pool() { _non_padded_pool.clear(); }
void memory_pool::clear_pool_for_network(uint32_t network_id) {
// free up _non_padded_pool for this network
{

View File

@ -136,7 +136,7 @@ event::ptr gpu_buffer::copy_to(stream& stream, void* host_ptr, bool blocking) {
}
#ifdef ENABLE_ONEDNN_FOR_GPU
dnnl::memory gpu_buffer::get_onednn_memory(dnnl::memory::desc desc, int64_t offset) {
dnnl::memory gpu_buffer::get_onednn_memory(dnnl::memory::desc desc, int64_t offset) const {
auto onednn_engine = _engine->get_onednn_engine();
dnnl::memory dnnl_mem(desc, onednn_engine, DNNL_MEMORY_NONE);
dnnl::ocl_interop::set_mem_object(dnnl_mem, _buffer.get());
@ -482,7 +482,7 @@ event::ptr gpu_usm::copy_to(stream& stream, void* host_ptr, bool blocking) {
}
#ifdef ENABLE_ONEDNN_FOR_GPU
dnnl::memory gpu_usm::get_onednn_memory(dnnl::memory::desc desc, int64_t offset) {
dnnl::memory gpu_usm::get_onednn_memory(dnnl::memory::desc desc, int64_t offset) const {
auto onednn_engine = _engine->get_onednn_engine();
dnnl::memory dnnl_mem = dnnl::ocl_interop::make_memory(desc, onednn_engine, dnnl::ocl_interop::memory_kind::usm,
reinterpret_cast<uint8_t*>(_buffer.get()) + offset);

View File

@ -46,7 +46,7 @@ struct gpu_buffer : public lockable_gpu_mem, public memory {
event::ptr copy_to(stream& stream, void* other , bool blocking) override;
#ifdef ENABLE_ONEDNN_FOR_GPU
dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) override;
dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) const override;
#endif
protected:
@ -124,7 +124,7 @@ struct gpu_usm : public lockable_gpu_mem, public memory {
event::ptr copy_to(stream& stream, void* host_ptr, bool blocking) override;
#ifdef ENABLE_ONEDNN_FOR_GPU
dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) override;
dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) const override;
#endif
static allocation_type detect_allocation_type(const ocl_engine* engine, const void* mem_ptr);

View File

@ -12,6 +12,8 @@
#include "intel_gpu/primitives/concatenation.hpp"
#include "intel_gpu/primitives/reorder.hpp"
#include "intel_gpu/primitives/reshape.hpp"
#include "intel_gpu/primitives/fully_connected.hpp"
#include "primitive_inst.h"
#include "runtime/ocl/ocl_event.hpp"
@ -208,4 +210,44 @@ TEST(network_test, has_proper_event_for_in_order_queue_onednn) {
ASSERT_TRUE(downcast<ocl::ocl_base_event>(reorder_ev.get())->get().get() != nullptr);
ASSERT_TRUE(downcast<ocl::ocl_base_event>(activation_ev.get())->get().get() != nullptr);
}
// Verifies that two networks built from the same program each get their own
// oneDNN scratchpad (intermediate) buffer rather than aliasing one allocation.
TEST(network_test, scratchpad_test) {
auto& engine = get_test_engine();
// oneDNN impls require immad (XMX) support; skip otherwise.
if (!engine.get_device_info().supports_immad)
return;
// benchdnn parameters:
// --ip --engine=gpu:0 --dir=FWD_B --dt=f16:f16:f16 --stag=abcd --wtag=any --dtag=ab --attr-scratchpad=user mb16384ic768ih1iw1oc3072
layout in_layout{{16384, 768}, data_types::f16, format::bfyx};
auto input_mem = engine.allocate_memory(in_layout);
auto weights = engine.allocate_memory({{3072, 768}, data_types::f16, format::oiyx});
topology topology;
topology.add(input_layout("input", in_layout));
topology.add(data("weights", weights));
topology.add(fully_connected("fc_prim", input_info("input"), "weights", "", padding()));
// Force the FC primitive onto the oneDNN implementation so a scratchpad is used.
auto impl_desc_onednn = ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::onednn};
auto impl_forcing_map = ov::intel_gpu::ImplForcingMap{{"fc_prim", impl_desc_onednn}};
auto config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::optimize_data(false));
config.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
config.set_property(ov::intel_gpu::force_implementations(impl_forcing_map));
network net1(engine, topology, config);
net1.set_input_data("input", input_mem);
net1.execute();
// Second network shares net1's program but uses stream/network id 1.
network net2(net1.get_program(), 1);
net2.set_input_data("input", input_mem);
net2.execute();
auto fc1 = net1.get_primitive("fc_prim");
auto fc2 = net2.get_primitive("fc_prim");
// The scratchpad is intermediates memory [0]; the two networks must not share it.
ASSERT_TRUE(fc1->get_intermediates_memories()[0]->buffer_ptr() != fc2->get_intermediates_memories()[0]->buffer_ptr());
}
#endif