From d464d079b23d6043b897b54b48bb46c0ccdd5274 Mon Sep 17 00:00:00 2001 From: Sergey Shlyapnikov Date: Thu, 24 Nov 2022 10:37:11 +0400 Subject: [PATCH] [GPU] Add blocking option for cldnn::memory::copy_*() functions (#14012) --- .../include/intel_gpu/runtime/memory.hpp | 16 ++-- .../intel_gpu/src/plugin/infer_request.cpp | 8 +- .../src/runtime/ocl/ocl_base_event.hpp | 2 +- .../intel_gpu/src/runtime/ocl/ocl_event.hpp | 4 +- .../intel_gpu/src/runtime/ocl/ocl_memory.cpp | 73 +++++++++++-------- .../intel_gpu/src/runtime/ocl/ocl_memory.hpp | 20 ++--- .../src/runtime/ocl/ocl_user_event.hpp | 2 +- .../tests/module_tests/usm_memory_test.cpp | 2 +- 8 files changed, 70 insertions(+), 57 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp index c2e8d61ac1b..8d2930a1003 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp @@ -68,11 +68,11 @@ struct memory { return true; } - virtual event::ptr copy_from(stream& /* stream */, const memory& /* other */) = 0; - virtual event::ptr copy_from(stream& /* stream */, const void* /* host_ptr */) = 0; + virtual event::ptr copy_from(stream& /* stream */, const memory& /* other */, bool blocking = true) = 0; + virtual event::ptr copy_from(stream& /* stream */, const void* /* host_ptr */, bool blocking = true) = 0; - virtual event::ptr copy_to(stream& stream, memory& other) { return other.copy_from(stream, *this); } - virtual event::ptr copy_to(stream& /* stream */, void* /* host_ptr */) = 0; + virtual event::ptr copy_to(stream& stream, memory& other, bool blocking = true) { return other.copy_from(stream, *this, blocking); } + virtual event::ptr copy_to(stream& /* stream */, void* /* host_ptr */, bool blocking = true) = 0; #ifdef ENABLE_ONEDNN_FOR_GPU virtual dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) { @@ -108,11 +108,11 @@ 
struct simple_attached_memory : memory { #endif 0}; }; - event::ptr copy_from(stream& /* stream */, const memory& /* other */) override { return nullptr; }; - event::ptr copy_from(stream& /* stream */, const void* /* host_ptr */) override { return nullptr; } + event::ptr copy_from(stream& /* stream */, const memory& /* other */, bool /* blocking */) override { return nullptr; }; + event::ptr copy_from(stream& /* stream */, const void* /* host_ptr */, bool /* blocking */) override { return nullptr; } - event::ptr copy_to(stream& /* stream */, memory& /* other */) override { return nullptr; }; - event::ptr copy_to(stream& /* stream */, void* /* host_ptr */) override { return nullptr; } + event::ptr copy_to(stream& /* stream */, memory& /* other */, bool /* blocking */) override { return nullptr; }; + event::ptr copy_to(stream& /* stream */, void* /* host_ptr */, bool /* blocking */) override { return nullptr; } private: void* _pointer; diff --git a/src/plugins/intel_gpu/src/plugin/infer_request.cpp b/src/plugins/intel_gpu/src/plugin/infer_request.cpp index 8854440f52d..7cb03345367 100644 --- a/src/plugins/intel_gpu/src/plugin/infer_request.cpp +++ b/src/plugins/intel_gpu/src/plugin/infer_request.cpp @@ -647,8 +647,7 @@ void InferRequest::copy_output_data(cldnn::memory::ptr src, Blob::Ptr dst) { OPENVINO_ASSERT(intermediate_output_blob, "[GPU] Intermediate blob for outputs precessing is not allocated"); - auto event = src->copy_to(stream, intermediate_output_blob->buffer()); - event->wait(); + src->copy_to(stream, intermediate_output_blob->buffer()); switch (dst->getTensorDesc().getPrecision()) { #define CASE(PRC, SRC_DT, DST_DT) \ @@ -670,8 +669,7 @@ void InferRequest::copy_output_data(cldnn::memory::ptr src, Blob::Ptr dst) { } } else { auto dst_ptr = dst->buffer().as(); - auto event = src->copy_to(stream, dst_ptr); - event->wait(); + src->copy_to(stream, dst_ptr); } } @@ -959,7 +957,7 @@ void InferRequest::prepare_input(const cldnn::primitive_id& inputName, 
Blob::Ptr auto src_lock = inputBlob->cbuffer(); auto src_ptr = src_lock.as(); if (!same_host_mem(inputMem, src_ptr)) { - auto ev = inputMem->copy_from(stream, src_ptr); + auto ev = inputMem->copy_from(stream, src_ptr, false); dependencies.push_back(ev); } } diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_base_event.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_base_event.hpp index 095bf597d7d..f95a00fd00a 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_base_event.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_base_event.hpp @@ -25,7 +25,7 @@ struct ocl_base_event : public event { public: explicit ocl_base_event(uint64_t queue_stamp = 0) : event(), _queue_stamp(queue_stamp) { } uint64_t get_queue_stamp() const { return _queue_stamp; } - virtual cl::Event get() = 0; + virtual cl::Event& get() = 0; protected: uint64_t _queue_stamp = 0; diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.hpp index d0a1a8573f7..1c2235da214 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.hpp @@ -20,7 +20,7 @@ public: : ocl_base_event(queue_stamp) , _event(ev) {} - cl::Event get() override { return _event; } + cl::Event& get() override { return _event; } private: bool _callback_set = false; @@ -47,7 +47,7 @@ public: process_events(ev); } - cl::Event get() override { return _last_ocl_event; } + cl::Event& get() override { return _last_ocl_event; } void reset() override { event::reset(); diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp index e20a237b6ad..0963e1b5f4f 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp @@ -70,7 +70,7 @@ event::ptr gpu_buffer::fill(stream& stream) { event::ptr gpu_buffer::fill(stream& stream, unsigned char pattern) { auto& cl_stream = downcast(stream); auto ev = 
stream.create_base_event(); - cl::Event ev_ocl = std::dynamic_pointer_cast(ev)->get(); + cl::Event& ev_ocl = downcast(ev.get())->get(); cl_stream.get_cl_queue().enqueueFillBuffer(_buffer, pattern, 0, size(), nullptr, &ev_ocl); // TODO: do we need sync here? @@ -91,30 +91,33 @@ shared_mem_params gpu_buffer::get_internal_params() const { 0}; } -event::ptr gpu_buffer::copy_from(stream& stream, const memory& other) { +event::ptr gpu_buffer::copy_from(stream& stream, const memory& other, bool blocking) { auto& cl_stream = downcast(stream); auto& mem_inst = downcast(other); auto ev = stream.create_base_event(); - cl::Event ev_ocl = std::dynamic_pointer_cast(ev)->get(); + cl::Event& ev_ocl = downcast(ev.get())->get(); cl_stream.get_cl_queue().enqueueCopyBuffer(mem_inst.get_buffer(), get_buffer(), 0, 0, other.size(), nullptr, &ev_ocl); - return ev; -} - -event::ptr gpu_buffer::copy_from(stream& stream, const void* host_ptr) { - auto& cl_stream = downcast(stream); - auto ev = stream.create_base_event(); - cl::Event ev_ocl = std::dynamic_pointer_cast(ev)->get(); - cl_stream.get_cl_queue().enqueueWriteBuffer(_buffer, false, 0, size(), host_ptr, nullptr, &ev_ocl); + if (blocking) + ev->wait(); return ev; } -event::ptr gpu_buffer::copy_to(stream& stream, void* host_ptr) { +event::ptr gpu_buffer::copy_from(stream& stream, const void* host_ptr, bool blocking) { auto& cl_stream = downcast(stream); - auto ev = stream.create_base_event(); - cl::Event ev_ocl = downcast(ev.get())->get(); - cl_stream.get_cl_queue().enqueueReadBuffer(_buffer, false, 0, size(), host_ptr, nullptr, &ev_ocl); + auto ev = blocking ? stream.create_user_event(true) : stream.create_base_event(); + cl::Event* ev_ocl = blocking ? 
nullptr : &downcast(ev.get())->get(); + cl_stream.get_cl_queue().enqueueWriteBuffer(_buffer, blocking, 0, size(), host_ptr, nullptr, ev_ocl); + + return ev; +} + +event::ptr gpu_buffer::copy_to(stream& stream, void* host_ptr, bool blocking) { + auto& cl_stream = downcast(stream); + auto ev = blocking ? stream.create_user_event(true) : stream.create_base_event(); + cl::Event* ev_ocl = blocking ? nullptr : &downcast(ev.get())->get(); + cl_stream.get_cl_queue().enqueueReadBuffer(_buffer, blocking, 0, size(), host_ptr, nullptr, ev_ocl); return ev; } @@ -195,7 +198,7 @@ event::ptr gpu_image2d::fill(stream& stream) { event::ptr gpu_image2d::fill(stream& stream, unsigned char pattern) { auto& cl_stream = downcast(stream); auto ev = stream.create_base_event(); - cl::Event ev_ocl = downcast(ev.get())->get(); + cl::Event& ev_ocl = downcast(ev.get())->get(); cl_uint4 pattern_uint4 = {pattern, pattern, pattern, pattern}; cl_stream.get_cl_queue().enqueueFillImage(_buffer, pattern_uint4, {0, 0, 0}, {_width, _height, 1}, 0, &ev_ocl); @@ -245,19 +248,19 @@ shared_mem_params gpu_image2d::get_internal_params() const { 0}; } -event::ptr gpu_image2d::copy_from(stream& /* stream */, const memory& /* other */) { +event::ptr gpu_image2d::copy_from(stream& /* stream */, const memory& /* other */, bool /* blocking */) { throw std::runtime_error("[GPU] copy_from is not implemented for gpu_image2d"); } -event::ptr gpu_image2d::copy_from(stream& /* stream */, const void* /* host_ptr */) { +event::ptr gpu_image2d::copy_from(stream& /* stream */, const void* /* host_ptr */, bool /* blocking */) { throw std::runtime_error("[GPU] copy_from is not implemented for gpu_image2d"); } -event::ptr gpu_image2d::copy_to(stream& /* stream */, memory& /* other */) { +event::ptr gpu_image2d::copy_to(stream& /* stream */, memory& /* other */, bool /* blocking */) { throw std::runtime_error("[GPU] copy_to is not implemented for gpu_image2d"); } -event::ptr gpu_image2d::copy_to(stream& /* stream */, void* /* 
host_ptr */) { +event::ptr gpu_image2d::copy_to(stream& /* stream */, void* /* host_ptr */, bool /* blocking */) { throw std::runtime_error("[GPU] copy_to is not implemented for gpu_image2d"); } @@ -364,7 +367,7 @@ void gpu_usm::unlock(const stream& /* stream */) { event::ptr gpu_usm::fill(stream& stream, unsigned char pattern) { auto& cl_stream = downcast(stream); auto ev = stream.create_base_event(); - cl::Event ev_ocl = downcast(ev.get())->get(); + cl::Event& ev_ocl = downcast(ev.get())->get(); // enqueueFillUsm call will never finish. Driver bug? Uncomment when fixed. Some older drivers doesn't support enqueueFillUsm call at all. // cl_stream.get_usm_helper().enqueue_fill_mem(cl_stream.get_cl_queue(), _buffer.get(), pattern, _bytes_count, nullptr, &ev_ocl) // Workarounded with enqeue_memcopy. ToDo: Remove below code. Uncomment above. @@ -386,39 +389,52 @@ event::ptr gpu_usm::fill(stream& stream) { return fill(stream, 0); } -event::ptr gpu_usm::copy_from(stream& stream, const memory& other) { +event::ptr gpu_usm::copy_from(stream& stream, const memory& other, bool blocking) { auto& cl_stream = downcast(stream); auto& casted = downcast(other); auto dst_ptr = get_buffer().get(); auto src_ptr = casted.get_buffer().get(); + auto ev = blocking ? stream.create_user_event(true) : stream.create_base_event(); + cl::Event* ev_ocl = blocking ? nullptr : &downcast(ev.get())->get(); cl_stream.get_usm_helper().enqueue_memcpy(cl_stream.get_cl_queue(), dst_ptr, src_ptr, _bytes_count, - true); - return stream.create_user_event(true); + blocking, + nullptr, + ev_ocl); + return ev; } -event::ptr gpu_usm::copy_from(stream& stream, const void* host_ptr) { +event::ptr gpu_usm::copy_from(stream& stream, const void* host_ptr, bool blocking) { auto& cl_stream = downcast(stream); auto dst_ptr = get_buffer().get(); + auto ev = blocking ? stream.create_user_event(true) : stream.create_base_event(); + cl::Event* ev_ocl = blocking ? 
nullptr : &downcast(ev.get())->get(); cl_stream.get_usm_helper().enqueue_memcpy(cl_stream.get_cl_queue(), dst_ptr, host_ptr, _bytes_count, - true); + blocking, + nullptr, + ev_ocl); - return stream.create_user_event(true); + // Return the event passed to enqueue_memcpy (like the sibling overloads) so non-blocking callers can wait on this copy. + return ev; } -event::ptr gpu_usm::copy_to(stream& stream, void* host_ptr) { +event::ptr gpu_usm::copy_to(stream& stream, void* host_ptr, bool blocking) { auto& cl_stream = downcast(stream); + auto ev = blocking ? stream.create_user_event(true) : stream.create_base_event(); + cl::Event* ev_ocl = blocking ? nullptr : &downcast(ev.get())->get(); auto src_ptr = get_buffer().get(); cl_stream.get_usm_helper().enqueue_memcpy(cl_stream.get_cl_queue(), host_ptr, src_ptr, _bytes_count, - true); - return stream.create_user_event(true); + blocking, + nullptr, + ev_ocl); + return ev; } #ifdef ENABLE_ONEDNN_FOR_GPU diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.hpp index fc9e40b64d5..661f114f454 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.hpp @@ -40,10 +40,10 @@ struct gpu_buffer : public lockable_gpu_mem, public memory { return _buffer; } - event::ptr copy_from(stream& stream, const memory& other) override; - event::ptr copy_from(stream& stream, const void* host_ptr) override; + event::ptr copy_from(stream& stream, const memory& other, bool blocking) override; + event::ptr copy_from(stream& stream, const void* host_ptr, bool blocking) override; - event::ptr copy_to(stream& /* stream */, void* /* other */) override; + event::ptr copy_to(stream& stream, void* other , bool blocking) override; #ifdef ENABLE_ONEDNN_FOR_GPU dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) override; @@ -67,11 +67,11 @@ struct gpu_image2d : public lockable_gpu_mem, public memory { return _buffer; } - event::ptr copy_from(stream& /* stream */, const memory& /* other */) override; - event::ptr copy_from(stream& /* stream 
*/, const void* /* other */) override; + event::ptr copy_from(stream& /* stream */, const memory& /* other */, bool /* blocking */) override; + event::ptr copy_from(stream& /* stream */, const void* /* other */, bool /* blocking */) override; - event::ptr copy_to(stream& /* stream */, memory& /* other */) override; - event::ptr copy_to(stream& /* stream */, void* /* other */) override; + event::ptr copy_to(stream& /* stream */, memory& /* other */, bool /* blocking */) override; + event::ptr copy_to(stream& /* stream */, void* /* other */, bool /* blocking */) override; protected: cl::Image2D _buffer; @@ -119,10 +119,10 @@ struct gpu_usm : public lockable_gpu_mem, public memory { event::ptr fill(stream& stream) override; shared_mem_params get_internal_params() const override; - event::ptr copy_from(stream& stream, const memory& other) override; - event::ptr copy_from(stream& stream, const void* host_ptr) override; + event::ptr copy_from(stream& stream, const memory& other, bool blocking) override; + event::ptr copy_from(stream& stream, const void* host_ptr, bool blocking) override; - event::ptr copy_to(stream& stream, void* host_ptr) override; + event::ptr copy_to(stream& stream, void* host_ptr, bool blocking) override; #ifdef ENABLE_ONEDNN_FOR_GPU dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) override; #endif diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_user_event.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_user_event.hpp index 781f5b28301..abc8e82385c 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_user_event.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_user_event.hpp @@ -30,7 +30,7 @@ struct ocl_user_event : public ocl_base_event { void set_impl() override; bool get_profiling_info_impl(std::list& info) override; - cl::Event get() override { return _event; }; + cl::Event& get() override { return _event; }; protected: cldnn::instrumentation::timer<> _timer; diff --git 
a/src/plugins/intel_gpu/tests/module_tests/usm_memory_test.cpp b/src/plugins/intel_gpu/tests/module_tests/usm_memory_test.cpp index f40aecf01f1..3c52be6f9a3 100644 --- a/src/plugins/intel_gpu/tests/module_tests/usm_memory_test.cpp +++ b/src/plugins/intel_gpu/tests/module_tests/usm_memory_test.cpp @@ -135,7 +135,7 @@ TEST_P(copy_and_read_buffer, basic) { cldnn::mem_lock lock(host_buf, stream); std::copy(src_buffer.begin(), src_buffer.end(), lock.data()); } - casted->copy_from(stream, *host_buf); + casted->copy_from(stream, *host_buf, true); break; } default: