[GPU] Add blocking option for cldnn::memory::copy_*() functions (#14012)

This commit is contained in:
Sergey Shlyapnikov 2022-11-24 10:37:11 +04:00 committed by GitHub
parent cc219d085e
commit d464d079b2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 70 additions and 57 deletions

View File

@ -68,11 +68,11 @@ struct memory {
return true; return true;
} }
virtual event::ptr copy_from(stream& /* stream */, const memory& /* other */) = 0; virtual event::ptr copy_from(stream& /* stream */, const memory& /* other */, bool blocking = true) = 0;
virtual event::ptr copy_from(stream& /* stream */, const void* /* host_ptr */) = 0; virtual event::ptr copy_from(stream& /* stream */, const void* /* host_ptr */, bool blocking = true) = 0;
virtual event::ptr copy_to(stream& stream, memory& other) { return other.copy_from(stream, *this); } virtual event::ptr copy_to(stream& stream, memory& other, bool blocking = true) { return other.copy_from(stream, *this, blocking); }
virtual event::ptr copy_to(stream& /* stream */, void* /* host_ptr */) = 0; virtual event::ptr copy_to(stream& /* stream */, void* /* host_ptr */, bool blocking = true) = 0;
#ifdef ENABLE_ONEDNN_FOR_GPU #ifdef ENABLE_ONEDNN_FOR_GPU
virtual dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) { virtual dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) {
@ -108,11 +108,11 @@ struct simple_attached_memory : memory {
#endif #endif
0}; }; 0}; };
event::ptr copy_from(stream& /* stream */, const memory& /* other */) override { return nullptr; }; event::ptr copy_from(stream& /* stream */, const memory& /* other */, bool /* blocking */) override { return nullptr; };
event::ptr copy_from(stream& /* stream */, const void* /* host_ptr */) override { return nullptr; } event::ptr copy_from(stream& /* stream */, const void* /* host_ptr */, bool /* blocking */) override { return nullptr; }
event::ptr copy_to(stream& /* stream */, memory& /* other */) override { return nullptr; }; event::ptr copy_to(stream& /* stream */, memory& /* other */, bool /* blocking */) override { return nullptr; };
event::ptr copy_to(stream& /* stream */, void* /* host_ptr */) override { return nullptr; } event::ptr copy_to(stream& /* stream */, void* /* host_ptr */, bool /* blocking */) override { return nullptr; }
private: private:
void* _pointer; void* _pointer;

View File

@ -647,8 +647,7 @@ void InferRequest::copy_output_data(cldnn::memory::ptr src, Blob::Ptr dst) {
OPENVINO_ASSERT(intermediate_output_blob, "[GPU] Intermediate blob for outputs precessing is not allocated"); OPENVINO_ASSERT(intermediate_output_blob, "[GPU] Intermediate blob for outputs precessing is not allocated");
auto event = src->copy_to(stream, intermediate_output_blob->buffer()); src->copy_to(stream, intermediate_output_blob->buffer());
event->wait();
switch (dst->getTensorDesc().getPrecision()) { switch (dst->getTensorDesc().getPrecision()) {
#define CASE(PRC, SRC_DT, DST_DT) \ #define CASE(PRC, SRC_DT, DST_DT) \
@ -670,8 +669,7 @@ void InferRequest::copy_output_data(cldnn::memory::ptr src, Blob::Ptr dst) {
} }
} else { } else {
auto dst_ptr = dst->buffer().as<void*>(); auto dst_ptr = dst->buffer().as<void*>();
auto event = src->copy_to(stream, dst_ptr); src->copy_to(stream, dst_ptr);
event->wait();
} }
} }
@ -959,7 +957,7 @@ void InferRequest::prepare_input(const cldnn::primitive_id& inputName, Blob::Ptr
auto src_lock = inputBlob->cbuffer(); auto src_lock = inputBlob->cbuffer();
auto src_ptr = src_lock.as<uint8_t*>(); auto src_ptr = src_lock.as<uint8_t*>();
if (!same_host_mem(inputMem, src_ptr)) { if (!same_host_mem(inputMem, src_ptr)) {
auto ev = inputMem->copy_from(stream, src_ptr); auto ev = inputMem->copy_from(stream, src_ptr, false);
dependencies.push_back(ev); dependencies.push_back(ev);
} }
} }

View File

@ -25,7 +25,7 @@ struct ocl_base_event : public event {
public: public:
explicit ocl_base_event(uint64_t queue_stamp = 0) : event(), _queue_stamp(queue_stamp) { } explicit ocl_base_event(uint64_t queue_stamp = 0) : event(), _queue_stamp(queue_stamp) { }
uint64_t get_queue_stamp() const { return _queue_stamp; } uint64_t get_queue_stamp() const { return _queue_stamp; }
virtual cl::Event get() = 0; virtual cl::Event& get() = 0;
protected: protected:
uint64_t _queue_stamp = 0; uint64_t _queue_stamp = 0;

View File

@ -20,7 +20,7 @@ public:
: ocl_base_event(queue_stamp) : ocl_base_event(queue_stamp)
, _event(ev) {} , _event(ev) {}
cl::Event get() override { return _event; } cl::Event& get() override { return _event; }
private: private:
bool _callback_set = false; bool _callback_set = false;
@ -47,7 +47,7 @@ public:
process_events(ev); process_events(ev);
} }
cl::Event get() override { return _last_ocl_event; } cl::Event& get() override { return _last_ocl_event; }
void reset() override { void reset() override {
event::reset(); event::reset();

View File

@ -70,7 +70,7 @@ event::ptr gpu_buffer::fill(stream& stream) {
event::ptr gpu_buffer::fill(stream& stream, unsigned char pattern) { event::ptr gpu_buffer::fill(stream& stream, unsigned char pattern) {
auto& cl_stream = downcast<ocl_stream>(stream); auto& cl_stream = downcast<ocl_stream>(stream);
auto ev = stream.create_base_event(); auto ev = stream.create_base_event();
cl::Event ev_ocl = std::dynamic_pointer_cast<ocl_event>(ev)->get(); cl::Event& ev_ocl = downcast<ocl_event>(ev.get())->get();
cl_stream.get_cl_queue().enqueueFillBuffer<unsigned char>(_buffer, pattern, 0, size(), nullptr, &ev_ocl); cl_stream.get_cl_queue().enqueueFillBuffer<unsigned char>(_buffer, pattern, 0, size(), nullptr, &ev_ocl);
// TODO: do we need sync here? // TODO: do we need sync here?
@ -91,30 +91,33 @@ shared_mem_params gpu_buffer::get_internal_params() const {
0}; 0};
} }
event::ptr gpu_buffer::copy_from(stream& stream, const memory& other) { event::ptr gpu_buffer::copy_from(stream& stream, const memory& other, bool blocking) {
auto& cl_stream = downcast<ocl_stream>(stream); auto& cl_stream = downcast<ocl_stream>(stream);
auto& mem_inst = downcast<const gpu_buffer>(other); auto& mem_inst = downcast<const gpu_buffer>(other);
auto ev = stream.create_base_event(); auto ev = stream.create_base_event();
cl::Event ev_ocl = std::dynamic_pointer_cast<ocl_event>(ev)->get(); cl::Event& ev_ocl = downcast<ocl_event>(ev.get())->get();
cl_stream.get_cl_queue().enqueueCopyBuffer(mem_inst.get_buffer(), get_buffer(), 0, 0, other.size(), nullptr, &ev_ocl); cl_stream.get_cl_queue().enqueueCopyBuffer(mem_inst.get_buffer(), get_buffer(), 0, 0, other.size(), nullptr, &ev_ocl);
return ev; if (blocking)
} ev->wait();
event::ptr gpu_buffer::copy_from(stream& stream, const void* host_ptr) {
auto& cl_stream = downcast<ocl_stream>(stream);
auto ev = stream.create_base_event();
cl::Event ev_ocl = std::dynamic_pointer_cast<ocl_event>(ev)->get();
cl_stream.get_cl_queue().enqueueWriteBuffer(_buffer, false, 0, size(), host_ptr, nullptr, &ev_ocl);
return ev; return ev;
} }
event::ptr gpu_buffer::copy_to(stream& stream, void* host_ptr) { event::ptr gpu_buffer::copy_from(stream& stream, const void* host_ptr, bool blocking) {
auto& cl_stream = downcast<ocl_stream>(stream); auto& cl_stream = downcast<ocl_stream>(stream);
auto ev = stream.create_base_event(); auto ev = blocking ? stream.create_user_event(true) : stream.create_base_event();
cl::Event ev_ocl = downcast<ocl_event>(ev.get())->get(); cl::Event* ev_ocl = blocking ? nullptr : &downcast<ocl_event>(ev.get())->get();
cl_stream.get_cl_queue().enqueueReadBuffer(_buffer, false, 0, size(), host_ptr, nullptr, &ev_ocl); cl_stream.get_cl_queue().enqueueWriteBuffer(_buffer, blocking, 0, size(), host_ptr, nullptr, ev_ocl);
return ev;
}
event::ptr gpu_buffer::copy_to(stream& stream, void* host_ptr, bool blocking) {
auto& cl_stream = downcast<ocl_stream>(stream);
auto ev = blocking ? stream.create_user_event(true) : stream.create_base_event();
cl::Event* ev_ocl = blocking ? nullptr : &downcast<ocl_event>(ev.get())->get();
cl_stream.get_cl_queue().enqueueReadBuffer(_buffer, blocking, 0, size(), host_ptr, nullptr, ev_ocl);
return ev; return ev;
} }
@ -195,7 +198,7 @@ event::ptr gpu_image2d::fill(stream& stream) {
event::ptr gpu_image2d::fill(stream& stream, unsigned char pattern) { event::ptr gpu_image2d::fill(stream& stream, unsigned char pattern) {
auto& cl_stream = downcast<ocl_stream>(stream); auto& cl_stream = downcast<ocl_stream>(stream);
auto ev = stream.create_base_event(); auto ev = stream.create_base_event();
cl::Event ev_ocl = downcast<ocl_event>(ev.get())->get(); cl::Event& ev_ocl = downcast<ocl_event>(ev.get())->get();
cl_uint4 pattern_uint4 = {pattern, pattern, pattern, pattern}; cl_uint4 pattern_uint4 = {pattern, pattern, pattern, pattern};
cl_stream.get_cl_queue().enqueueFillImage(_buffer, pattern_uint4, {0, 0, 0}, {_width, _height, 1}, 0, &ev_ocl); cl_stream.get_cl_queue().enqueueFillImage(_buffer, pattern_uint4, {0, 0, 0}, {_width, _height, 1}, 0, &ev_ocl);
@ -245,19 +248,19 @@ shared_mem_params gpu_image2d::get_internal_params() const {
0}; 0};
} }
event::ptr gpu_image2d::copy_from(stream& /* stream */, const memory& /* other */) { event::ptr gpu_image2d::copy_from(stream& /* stream */, const memory& /* other */, bool /* blocking */) {
throw std::runtime_error("[GPU] copy_from is not implemented for gpu_image2d"); throw std::runtime_error("[GPU] copy_from is not implemented for gpu_image2d");
} }
event::ptr gpu_image2d::copy_from(stream& /* stream */, const void* /* host_ptr */) { event::ptr gpu_image2d::copy_from(stream& /* stream */, const void* /* host_ptr */, bool /* blocking */) {
throw std::runtime_error("[GPU] copy_from is not implemented for gpu_image2d"); throw std::runtime_error("[GPU] copy_from is not implemented for gpu_image2d");
} }
event::ptr gpu_image2d::copy_to(stream& /* stream */, memory& /* other */) { event::ptr gpu_image2d::copy_to(stream& /* stream */, memory& /* other */, bool /* blocking */) {
throw std::runtime_error("[GPU] copy_to is not implemented for gpu_image2d"); throw std::runtime_error("[GPU] copy_to is not implemented for gpu_image2d");
} }
event::ptr gpu_image2d::copy_to(stream& /* stream */, void* /* host_ptr */) { event::ptr gpu_image2d::copy_to(stream& /* stream */, void* /* host_ptr */, bool /* blocking */) {
throw std::runtime_error("[GPU] copy_to is not implemented for gpu_image2d"); throw std::runtime_error("[GPU] copy_to is not implemented for gpu_image2d");
} }
@ -364,7 +367,7 @@ void gpu_usm::unlock(const stream& /* stream */) {
event::ptr gpu_usm::fill(stream& stream, unsigned char pattern) { event::ptr gpu_usm::fill(stream& stream, unsigned char pattern) {
auto& cl_stream = downcast<ocl_stream>(stream); auto& cl_stream = downcast<ocl_stream>(stream);
auto ev = stream.create_base_event(); auto ev = stream.create_base_event();
cl::Event ev_ocl = downcast<ocl_event>(ev.get())->get(); cl::Event& ev_ocl = downcast<ocl_event>(ev.get())->get();
// enqueueFillUsm call will never finish. Driver bug? Uncomment when fixed. Some older drivers doesn't support enqueueFillUsm call at all. // enqueueFillUsm call will never finish. Driver bug? Uncomment when fixed. Some older drivers doesn't support enqueueFillUsm call at all.
// cl_stream.get_usm_helper().enqueue_fill_mem<unsigned char>(cl_stream.get_cl_queue(), _buffer.get(), pattern, _bytes_count, nullptr, &ev_ocl) // cl_stream.get_usm_helper().enqueue_fill_mem<unsigned char>(cl_stream.get_cl_queue(), _buffer.get(), pattern, _bytes_count, nullptr, &ev_ocl)
// Workarounded with enqeue_memcopy. ToDo: Remove below code. Uncomment above. // Workarounded with enqeue_memcopy. ToDo: Remove below code. Uncomment above.
@ -386,39 +389,51 @@ event::ptr gpu_usm::fill(stream& stream) {
return fill(stream, 0); return fill(stream, 0);
} }
event::ptr gpu_usm::copy_from(stream& stream, const memory& other) { event::ptr gpu_usm::copy_from(stream& stream, const memory& other, bool blocking) {
auto& cl_stream = downcast<const ocl_stream>(stream); auto& cl_stream = downcast<const ocl_stream>(stream);
auto& casted = downcast<const gpu_usm>(other); auto& casted = downcast<const gpu_usm>(other);
auto dst_ptr = get_buffer().get(); auto dst_ptr = get_buffer().get();
auto src_ptr = casted.get_buffer().get(); auto src_ptr = casted.get_buffer().get();
auto ev = blocking ? stream.create_user_event(true) : stream.create_base_event();
cl::Event* ev_ocl = blocking ? nullptr : &downcast<ocl_event>(ev.get())->get();
cl_stream.get_usm_helper().enqueue_memcpy(cl_stream.get_cl_queue(), cl_stream.get_usm_helper().enqueue_memcpy(cl_stream.get_cl_queue(),
dst_ptr, dst_ptr,
src_ptr, src_ptr,
_bytes_count, _bytes_count,
true); blocking,
return stream.create_user_event(true); nullptr,
ev_ocl);
return ev;
} }
/// Copies `_bytes_count` bytes from `host_ptr` into this USM allocation.
///
/// @param stream    Stream to enqueue on; downcast to ocl_stream to reach the
///                  OpenCL queue and the USM helper.
/// @param host_ptr  Source pointer for the copy.
/// @param blocking  Forwarded to enqueue_memcpy as its blocking flag. When
///                  false, the copy is tracked through the returned event.
/// @return The event created for this copy: a user event from
///         create_user_event(true) in the blocking case (presumably already
///         signalled - confirm against stream), otherwise the base event whose
///         cl::Event was passed to enqueue_memcpy.
event::ptr gpu_usm::copy_from(stream& stream, const void* host_ptr, bool blocking) {
    auto& cl_stream = downcast<ocl_stream>(stream);
    auto dst_ptr = get_buffer().get();
    // In the blocking case no cl::Event is requested from the enqueue call,
    // so a user event stands in as the return value.
    auto ev = blocking ? stream.create_user_event(true) : stream.create_base_event();
    cl::Event* ev_ocl = blocking ? nullptr : &downcast<ocl_event>(ev.get())->get();
    cl_stream.get_usm_helper().enqueue_memcpy(cl_stream.get_cl_queue(),
                                              dst_ptr,
                                              host_ptr,
                                              _bytes_count,
                                              blocking,
                                              nullptr,
                                              ev_ocl);
    // Return the event bound to the enqueued copy. Returning a freshly created
    // user event here would hand non-blocking callers an event unrelated to
    // the transfer, so waiting on it would not wait for the data.
    return ev;
}
event::ptr gpu_usm::copy_to(stream& stream, void* host_ptr) { event::ptr gpu_usm::copy_to(stream& stream, void* host_ptr, bool blocking) {
auto& cl_stream = downcast<ocl_stream>(stream); auto& cl_stream = downcast<ocl_stream>(stream);
auto ev = blocking ? stream.create_user_event(true) : stream.create_base_event();
cl::Event* ev_ocl = blocking ? nullptr : &downcast<ocl_event>(ev.get())->get();
auto src_ptr = get_buffer().get(); auto src_ptr = get_buffer().get();
cl_stream.get_usm_helper().enqueue_memcpy(cl_stream.get_cl_queue(), cl_stream.get_usm_helper().enqueue_memcpy(cl_stream.get_cl_queue(),
host_ptr, host_ptr,
src_ptr, src_ptr,
_bytes_count, _bytes_count,
true); blocking,
return stream.create_user_event(true); nullptr,
ev_ocl);
return ev;
} }
#ifdef ENABLE_ONEDNN_FOR_GPU #ifdef ENABLE_ONEDNN_FOR_GPU

View File

@ -40,10 +40,10 @@ struct gpu_buffer : public lockable_gpu_mem, public memory {
return _buffer; return _buffer;
} }
event::ptr copy_from(stream& stream, const memory& other) override; event::ptr copy_from(stream& stream, const memory& other, bool blocking) override;
event::ptr copy_from(stream& stream, const void* host_ptr) override; event::ptr copy_from(stream& stream, const void* host_ptr, bool blocking) override;
event::ptr copy_to(stream& /* stream */, void* /* other */) override; event::ptr copy_to(stream& stream, void* other , bool blocking) override;
#ifdef ENABLE_ONEDNN_FOR_GPU #ifdef ENABLE_ONEDNN_FOR_GPU
dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) override; dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) override;
@ -67,11 +67,11 @@ struct gpu_image2d : public lockable_gpu_mem, public memory {
return _buffer; return _buffer;
} }
event::ptr copy_from(stream& /* stream */, const memory& /* other */) override; event::ptr copy_from(stream& /* stream */, const memory& /* other */, bool /* blocking */) override;
event::ptr copy_from(stream& /* stream */, const void* /* other */) override; event::ptr copy_from(stream& /* stream */, const void* /* other */, bool /* blocking */) override;
event::ptr copy_to(stream& /* stream */, memory& /* other */) override; event::ptr copy_to(stream& /* stream */, memory& /* other */, bool /* blocking */) override;
event::ptr copy_to(stream& /* stream */, void* /* other */) override; event::ptr copy_to(stream& /* stream */, void* /* other */, bool /* blocking */) override;
protected: protected:
cl::Image2D _buffer; cl::Image2D _buffer;
@ -119,10 +119,10 @@ struct gpu_usm : public lockable_gpu_mem, public memory {
event::ptr fill(stream& stream) override; event::ptr fill(stream& stream) override;
shared_mem_params get_internal_params() const override; shared_mem_params get_internal_params() const override;
event::ptr copy_from(stream& stream, const memory& other) override; event::ptr copy_from(stream& stream, const memory& other, bool blocking) override;
event::ptr copy_from(stream& stream, const void* host_ptr) override; event::ptr copy_from(stream& stream, const void* host_ptr, bool blocking) override;
event::ptr copy_to(stream& stream, void* host_ptr) override; event::ptr copy_to(stream& stream, void* host_ptr, bool blocking) override;
#ifdef ENABLE_ONEDNN_FOR_GPU #ifdef ENABLE_ONEDNN_FOR_GPU
dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) override; dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) override;
#endif #endif

View File

@ -30,7 +30,7 @@ struct ocl_user_event : public ocl_base_event {
void set_impl() override; void set_impl() override;
bool get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) override; bool get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) override;
cl::Event get() override { return _event; }; cl::Event& get() override { return _event; };
protected: protected:
cldnn::instrumentation::timer<> _timer; cldnn::instrumentation::timer<> _timer;

View File

@ -135,7 +135,7 @@ TEST_P(copy_and_read_buffer, basic) {
cldnn::mem_lock<float> lock(host_buf, stream); cldnn::mem_lock<float> lock(host_buf, stream);
std::copy(src_buffer.begin(), src_buffer.end(), lock.data()); std::copy(src_buffer.begin(), src_buffer.end(), lock.data());
} }
casted->copy_from(stream, *host_buf); casted->copy_from(stream, *host_buf, true);
break; break;
} }
default: default: