[dGPU] Enable user scratchpad mode for onednn primitive. (#18699)
* [dGPU] Enable user scratchpad mode.
* Reuse intermediate buffer.
  + Add the node's own id to its memory dependencies in the c-tor of program_node.
  + Allocate intermediate memory with the memory_pool::get_memory() function.
  + Assign the scratchpad memory desc in the load() function for onednn primitive serialization.
* Allocate device mem for onednn scratchpad mem.
This commit is contained in:
parent
63ac68e745
commit
c0783f16ed
@ -78,7 +78,7 @@ struct memory {
|
||||
virtual event::ptr copy_to(stream& /* stream */, void* /* host_ptr */, bool blocking = true) = 0;
|
||||
|
||||
#ifdef ENABLE_ONEDNN_FOR_GPU
|
||||
virtual dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) {
|
||||
virtual dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) const {
|
||||
throw std::runtime_error("[CLDNN] Can't convert memory object to onednn");
|
||||
}
|
||||
#endif
|
||||
|
@ -124,7 +124,6 @@ public:
|
||||
const primitive_id& id,
|
||||
uint32_t network_id,
|
||||
allocation_type type);
|
||||
void clear_pool();
|
||||
void clear_pool_for_network(uint32_t network_id);
|
||||
void release_memory(memory* memory, const primitive_id& id, uint32_t network_id);
|
||||
};
|
||||
|
@ -108,6 +108,8 @@ public:
|
||||
std::vector<uint8_t> prim_cache;
|
||||
ib >> prim_cache;
|
||||
|
||||
_scratchpad_md = _pd.scratchpad_desc();
|
||||
|
||||
_prim = dnnl::concat(_pd, prim_cache);
|
||||
#endif
|
||||
}
|
||||
|
@ -213,6 +213,8 @@ public:
|
||||
_pd = *prim_desc;
|
||||
}
|
||||
|
||||
_scratchpad_md = _pd.scratchpad_desc();
|
||||
|
||||
std::vector<uint8_t> prim_cache;
|
||||
ib >> prim_cache;
|
||||
|
||||
|
@ -130,6 +130,8 @@ public:
|
||||
std::vector<uint8_t> prim_cache;
|
||||
ib >> prim_cache;
|
||||
|
||||
_scratchpad_md = _pd.scratchpad_desc();
|
||||
|
||||
_prim = dnnl::primitive(_pd, prim_cache);
|
||||
#endif
|
||||
}
|
||||
|
@ -168,6 +168,8 @@ public:
|
||||
std::vector<uint8_t> prim_cache;
|
||||
ib >> prim_cache;
|
||||
|
||||
_scratchpad_md = _pd.scratchpad_desc();
|
||||
|
||||
_prim = dnnl::primitive(_pd, prim_cache);
|
||||
#endif
|
||||
}
|
||||
|
@ -298,6 +298,8 @@ public:
|
||||
std::vector<uint8_t> prim_cache;
|
||||
ib >> prim_cache;
|
||||
|
||||
_scratchpad_md = _pd.scratchpad_desc();
|
||||
|
||||
_prim = dnnl::primitive(_pd, prim_cache);
|
||||
#endif
|
||||
}
|
||||
|
@ -144,6 +144,8 @@ public:
|
||||
std::vector<uint8_t> prim_cache;
|
||||
ib >> prim_cache;
|
||||
|
||||
_scratchpad_md = _pd.scratchpad_desc();
|
||||
|
||||
_prim = dnnl::primitive(_pd, prim_cache);
|
||||
#endif
|
||||
}
|
||||
|
@ -41,6 +41,7 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
|
||||
PrimDescType _pd;
|
||||
PrimType _prim;
|
||||
std::unordered_map<uint32_t, std::unordered_map<int, dnnl::memory>> _args;
|
||||
dnnl::memory::desc _scratchpad_md;
|
||||
bool _enable_profiling = false;
|
||||
|
||||
typed_primitive_onednn_impl(const engine& engine,
|
||||
@ -53,6 +54,24 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
|
||||
_attrs(attrs),
|
||||
_pd(pd) {
|
||||
_enable_profiling = config.get_property(ov::enable_profiling);
|
||||
|
||||
_scratchpad_md = _pd.scratchpad_desc();
|
||||
|
||||
GPU_DEBUG_GET_INSTANCE(debug_config);
|
||||
GPU_DEBUG_IF(!debug_config->dump_profiling_data.empty()) {
|
||||
_enable_profiling = true;
|
||||
}
|
||||
|
||||
GPU_DEBUG_IF(debug_config->verbose >= 4) {
|
||||
if (_scratchpad_md.get_size() > 0) {
|
||||
static std::atomic_llong total{0};
|
||||
int64_t size = _scratchpad_md.get_size() / 1048576;
|
||||
total += size;
|
||||
GPU_DEBUG_TRACE_DETAIL << " [scratchpad] kind: " << static_cast<int>(_pd.get_kind())
|
||||
<< ", " << size << "MB, total " << total << "MB" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
build_primitive(config);
|
||||
}
|
||||
|
||||
@ -189,7 +208,7 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
|
||||
|
||||
if (has_attrs) {
|
||||
{
|
||||
dnnl::scratchpad_mode _scratchpad_mode = dnnl::scratchpad_mode::library;
|
||||
dnnl::scratchpad_mode _scratchpad_mode = dnnl::scratchpad_mode::user;
|
||||
ib >> make_data(&_scratchpad_mode, sizeof(dnnl::scratchpad_mode));
|
||||
_attrs->set_scratchpad_mode(_scratchpad_mode);
|
||||
}
|
||||
@ -450,6 +469,12 @@ protected:
|
||||
args.insert({DNNL_ARG_DST, output.get_onednn_memory(_pd.dnnl::primitive_desc_base::dst_desc(0), offset)});
|
||||
}
|
||||
|
||||
if (_scratchpad_md.get_size() != 0) {
|
||||
// onednn primitive can have only 1 scratchpad memory.
|
||||
auto scratchpad = instance.get_intermediates_memories()[0];
|
||||
args.insert({DNNL_ARG_SCRATCHPAD, scratchpad->get_onednn_memory(_scratchpad_md, 0)});
|
||||
}
|
||||
|
||||
configure_post_ops_arguments(instance, args);
|
||||
|
||||
return args;
|
||||
@ -511,6 +536,12 @@ protected:
|
||||
|
||||
return event;
|
||||
}
|
||||
|
||||
// Reports the internal (intermediate) buffer layouts required by this oneDNN impl.
// Returns an empty list when the cached scratchpad descriptor has size 0; otherwise
// returns exactly one layout: u8 elements, bfyx format, shape {1, 1, 1, N} where N is
// _scratchpad_md.get_size().
// NOTE(review): the size is narrowed to tensor::value_type with a C-style cast —
// confirm that scratchpad sizes cannot exceed that type's range.
std::vector<layout> get_internal_buffer_layouts_impl() const override {
|
||||
// No scratchpad requested by the primitive descriptor -> no internal buffers.
if (_scratchpad_md.get_size() == 0)
|
||||
return {};
|
||||
// Single flat u8 buffer sized from the scratchpad descriptor.
return {{{1, 1, 1, (tensor::value_type)(_scratchpad_md.get_size())}, cldnn::data_types::u8, format::bfyx}};
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace onednn
|
||||
|
@ -143,6 +143,8 @@ public:
|
||||
std::vector<uint8_t> prim_cache;
|
||||
ib >> prim_cache;
|
||||
|
||||
_scratchpad_md = _pd.scratchpad_desc();
|
||||
|
||||
_prim = dnnl::primitive(_pd, prim_cache);
|
||||
#endif
|
||||
}
|
||||
|
@ -95,6 +95,8 @@ public:
|
||||
std::vector<uint8_t> prim_cache;
|
||||
ib >> prim_cache;
|
||||
|
||||
_scratchpad_md = _pd.scratchpad_desc();
|
||||
|
||||
_prim = dnnl::reorder(_pd, prim_cache);
|
||||
#endif
|
||||
}
|
||||
|
@ -988,7 +988,9 @@ memory::ptr primitive_inst::allocate_internal_buffer(size_t idx, bool reset) {
|
||||
auto layout = ibuf_layouts[idx];
|
||||
GPU_DEBUG_LOG << "[" << _node->id() << ": internal buf " << idx << "]" << std::endl;
|
||||
auto alloc_type = allocation_type::unknown;
|
||||
if (input_device_mem && ((int64_t) available_device_mem_size - (int64_t)layout.bytes_count() >= 0)) {
|
||||
if ((int64_t)available_device_mem_size - (int64_t)layout.bytes_count() >= 0 &&
|
||||
(input_device_mem || _node->get_preferred_impl_type() == impl_types::onednn)) {
|
||||
// oneDNN scratchpad memory is forced to be allocated as device memory.
|
||||
GPU_DEBUG_LOG << " input is device mem and available device mem size (" << available_device_mem_size
|
||||
<< ") > requested memory (" << layout.bytes_count() << " )" << std::endl;
|
||||
alloc_type = engine.get_preferred_memory_allocation_type();
|
||||
@ -998,7 +1000,12 @@ memory::ptr primitive_inst::allocate_internal_buffer(size_t idx, bool reset) {
|
||||
alloc_type = engine.get_lockable_preferred_memory_allocation_type();
|
||||
}
|
||||
GPU_DEBUG_LOG << "=> allocate to " << alloc_type << std::endl;
|
||||
return engine.allocate_memory(layout, alloc_type, reset);
|
||||
|
||||
// Reuse intermediate buffer like output buffer.
|
||||
auto ret_mem = _network.get_memory_pool().get_memory(layout, _node->id(), _network.get_id(), _node->get_memory_dependencies(), alloc_type, true, reset);
|
||||
GPU_DEBUG_LOG << " [" << _network.get_id() << ":" << _node->id() << ": internal buf " << idx << "] " << alloc_type
|
||||
<< " " << ret_mem->buffer_ptr() << std::endl;
|
||||
return ret_mem;
|
||||
}
|
||||
|
||||
void primitive_inst::allocate_internal_buffers(bool reset) {
|
||||
@ -1681,7 +1688,8 @@ void primitive_inst::load(cldnn::BinaryInputBuffer& ib) {
|
||||
allocation_type _allocation_type;
|
||||
ib >> make_data(&_allocation_type, sizeof(_allocation_type));
|
||||
|
||||
_intermediates_memory[i] = get_network().get_engine().allocate_memory(ibuf_layout, _allocation_type);
|
||||
_intermediates_memory[i] = get_network().get_memory_pool().get_memory(ibuf_layout, id(), get_network_id(),
|
||||
_node_mem_deps, _allocation_type, true, true);
|
||||
}
|
||||
|
||||
bool has_impl;
|
||||
|
@ -42,6 +42,7 @@ program_node::program_node(std::shared_ptr<primitive> prim, program& prog)
|
||||
output_layouts.push_back(output_layout);
|
||||
valid_output_layouts.push_back(false);
|
||||
}
|
||||
add_memory_dependency(id());
|
||||
}
|
||||
}
|
||||
|
||||
@ -944,6 +945,10 @@ void program_node::init_onednn_primitive_attributes() {
|
||||
// Added this for debug purposes only
|
||||
size_t empty_mem = 0xff;
|
||||
|
||||
// Change scratchpad mode to user
|
||||
if (attrs->get_scratchpad_mode() == dnnl::scratchpad_mode::library)
|
||||
attrs->set_scratchpad_mode(dnnl::scratchpad_mode::user);
|
||||
|
||||
// Add information about post-operation into the list, update indices
|
||||
auto update_onednn_post_op_list = [&](onednn_post_op_type type, size_t m_dep,
|
||||
dnnl::memory::format_tag tag = dnnl::memory::format_tag::undef,
|
||||
|
@ -240,8 +240,6 @@ memory::ptr memory_pool::get_memory(const layout& layout,
|
||||
}
|
||||
}
|
||||
|
||||
void memory_pool::clear_pool() { _non_padded_pool.clear(); }
|
||||
|
||||
void memory_pool::clear_pool_for_network(uint32_t network_id) {
|
||||
// free up _non_padded_pool for this network
|
||||
{
|
||||
|
@ -136,7 +136,7 @@ event::ptr gpu_buffer::copy_to(stream& stream, void* host_ptr, bool blocking) {
|
||||
}
|
||||
|
||||
#ifdef ENABLE_ONEDNN_FOR_GPU
|
||||
dnnl::memory gpu_buffer::get_onednn_memory(dnnl::memory::desc desc, int64_t offset) {
|
||||
dnnl::memory gpu_buffer::get_onednn_memory(dnnl::memory::desc desc, int64_t offset) const {
|
||||
auto onednn_engine = _engine->get_onednn_engine();
|
||||
dnnl::memory dnnl_mem(desc, onednn_engine, DNNL_MEMORY_NONE);
|
||||
dnnl::ocl_interop::set_mem_object(dnnl_mem, _buffer.get());
|
||||
@ -482,7 +482,7 @@ event::ptr gpu_usm::copy_to(stream& stream, void* host_ptr, bool blocking) {
|
||||
}
|
||||
|
||||
#ifdef ENABLE_ONEDNN_FOR_GPU
|
||||
dnnl::memory gpu_usm::get_onednn_memory(dnnl::memory::desc desc, int64_t offset) {
|
||||
dnnl::memory gpu_usm::get_onednn_memory(dnnl::memory::desc desc, int64_t offset) const {
|
||||
auto onednn_engine = _engine->get_onednn_engine();
|
||||
dnnl::memory dnnl_mem = dnnl::ocl_interop::make_memory(desc, onednn_engine, dnnl::ocl_interop::memory_kind::usm,
|
||||
reinterpret_cast<uint8_t*>(_buffer.get()) + offset);
|
||||
|
@ -46,7 +46,7 @@ struct gpu_buffer : public lockable_gpu_mem, public memory {
|
||||
event::ptr copy_to(stream& stream, void* other , bool blocking) override;
|
||||
|
||||
#ifdef ENABLE_ONEDNN_FOR_GPU
|
||||
dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) override;
|
||||
dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) const override;
|
||||
#endif
|
||||
|
||||
protected:
|
||||
@ -124,7 +124,7 @@ struct gpu_usm : public lockable_gpu_mem, public memory {
|
||||
|
||||
event::ptr copy_to(stream& stream, void* host_ptr, bool blocking) override;
|
||||
#ifdef ENABLE_ONEDNN_FOR_GPU
|
||||
dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) override;
|
||||
dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) const override;
|
||||
#endif
|
||||
|
||||
static allocation_type detect_allocation_type(const ocl_engine* engine, const void* mem_ptr);
|
||||
|
@ -12,6 +12,8 @@
|
||||
#include "intel_gpu/primitives/concatenation.hpp"
|
||||
#include "intel_gpu/primitives/reorder.hpp"
|
||||
#include "intel_gpu/primitives/reshape.hpp"
|
||||
#include "intel_gpu/primitives/fully_connected.hpp"
|
||||
#include "primitive_inst.h"
|
||||
|
||||
#include "runtime/ocl/ocl_event.hpp"
|
||||
|
||||
@ -208,4 +210,44 @@ TEST(network_test, has_proper_event_for_in_order_queue_onednn) {
|
||||
ASSERT_TRUE(downcast<ocl::ocl_base_event>(reorder_ev.get())->get().get() != nullptr);
|
||||
ASSERT_TRUE(downcast<ocl::ocl_base_event>(activation_ev.get())->get().get() != nullptr);
|
||||
}
|
||||
|
||||
// Checks that two networks created from the same program do not share the first
// intermediate buffer of a fully_connected primitive forced onto the oneDNN impl:
// the test passes when the two buffer pointers differ.
TEST(network_test, scratchpad_test) {
|
||||
auto& engine = get_test_engine();
|
||||
// Nothing to verify on devices that do not report supports_immad.
if (!engine.get_device_info().supports_immad)
|
||||
return;
|
||||
|
||||
// benchdnn parameters:
|
||||
// --ip --engine=gpu:0 --dir=FWD_B --dt=f16:f16:f16 --stag=abcd --wtag=any --dtag=ab --attr-scratchpad=user mb16384ic768ih1iw1oc3072
|
||||
layout in_layout{{16384, 768}, data_types::f16, format::bfyx};
|
||||
auto input_mem = engine.allocate_memory(in_layout);
|
||||
auto weights = engine.allocate_memory({{3072, 768}, data_types::f16, format::oiyx});
|
||||
|
||||
topology topology;
|
||||
topology.add(input_layout("input", in_layout));
|
||||
topology.add(data("weights", weights));
|
||||
topology.add(fully_connected("fc_prim", input_info("input"), "weights", "", padding()));
|
||||
|
||||
// Force "fc_prim" to use the oneDNN implementation with bfyx format.
auto impl_desc_onednn = ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::onednn};
|
||||
auto impl_forcing_map = ov::intel_gpu::ImplForcingMap{{"fc_prim", impl_desc_onednn}};
|
||||
|
||||
auto config = get_test_default_config(engine);
|
||||
config.set_property(ov::intel_gpu::optimize_data(false));
|
||||
config.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
|
||||
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
|
||||
config.set_property(ov::intel_gpu::force_implementations(impl_forcing_map));
|
||||
|
||||
network net1(engine, topology, config);
|
||||
net1.set_input_data("input", input_mem);
|
||||
net1.execute();
|
||||
|
||||
// Second network is built from net1's program (second ctor argument is 1 —
// presumably a stream id; TODO confirm) and fed the same input memory.
network net2(net1.get_program(), 1);
|
||||
net2.set_input_data("input", input_mem);
|
||||
net2.execute();
|
||||
|
||||
auto fc1 = net1.get_primitive("fc_prim");
|
||||
auto fc2 = net2.get_primitive("fc_prim");
|
||||
|
||||
// NOTE(review): indexing [0] assumes each primitive has at least one intermediate
// buffer, i.e. that a scratchpad was actually requested for this shape — confirm.
ASSERT_TRUE(fc1->get_intermediates_memories()[0]->buffer_ptr() != fc2->get_intermediates_memories()[0]->buffer_ptr());
|
||||
}
|
||||
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user