diff --git a/inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.cpp b/inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.cpp index 915eb79aea2..3e4ec0466b4 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.cpp @@ -102,59 +102,90 @@ bool base_event::get_profiling_info_impl(std::listwait(); + if (_last_ocl_event.get() != nullptr) { + _last_ocl_event.wait(); + if (get_context()->logging_enabled()) { + get_context()->log(0, "Wait for event: " + std::to_string(_queue_stamp)); } } } bool base_events::is_set_impl() { - if (!_events.empty()) { - for (size_t i = 0; i < _events.size(); i++) { - if (!_events[i]->is_set()) - return false; - } - return true; + if (_last_ocl_event.get() != nullptr) { + return _last_ocl_event.getInfo() == CL_COMPLETE; } return true; } bool base_events::get_profiling_info_impl(std::list& info) { - cl_ulong min_queue = CL_ULONG_MAX; - cl_ulong min_sub = CL_ULONG_MAX; - cl_ulong min_start = CL_ULONG_MAX; - uint64_t execution_time = 0; + + // For every profiling period (i.e. submission / starting / executing), + // the goal is to sum up all disjoint durations of its projection on the time axis + + std::map>> all_durations; for (size_t i = 0; i < _events.size(); i++) { auto be = dynamic_cast(_events[i].get()); if (!is_event_profiled(be->_event)) continue; - cl_ulong curr_queue; - cl_ulong curr_sub; - cl_ulong curr_start; - cl_ulong curr_end; - be->_event.getProfilingInfo(CL_PROFILING_COMMAND_QUEUED, &curr_queue); - be->_event.getProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, &curr_sub); - be->_event.getProfilingInfo(CL_PROFILING_COMMAND_START, &curr_start); - be->_event.getProfilingInfo(CL_PROFILING_COMMAND_END, &curr_end); + for (auto& period : profiling_periods) { + cl_ulong ev_start; + cl_ulong ev_end; + be->_event.getProfilingInfo(period.start, &ev_start); + be->_event.getProfilingInfo(period.stop, &ev_end); + auto ev_duration = std::make_pair(static_cast(ev_start), + static_cast(ev_end)); - if (curr_queue < min_queue) - min_queue = curr_queue; + auto& durations = all_durations[period.name]; + bool ev_duration_merged = false; + auto it = durations.begin(); - if (curr_sub < min_sub) - min_sub = curr_sub; + while (it != durations.end()) { + auto& duration = *it; + if ((duration.second >= ev_duration.first) && (duration.first <= ev_duration.second)) { + if ((duration.first == ev_duration.first) && (duration.second == ev_duration.second)) { + if (!ev_duration_merged) { + ev_duration_merged = true; + break; + } else { + it = durations.erase(it); + } + } else { + if (!ev_duration_merged) { + duration.first = std::min(duration.first, ev_duration.first); + duration.second = std::max(duration.second, ev_duration.second); + ev_duration = duration; + ev_duration_merged = true; + it++; + } else { + if (duration.second > ev_duration.second) { + ev_duration.second = duration.second; + it--; + it->second = ev_duration.second; + it++; + } + it = durations.erase(it); + } + } + } else { + it++; + } + } - if (curr_start < min_start) - min_start = curr_start; - - execution_time += curr_end - curr_start; + if (!ev_duration_merged) { + durations.insert(it, ev_duration); + } + } } - info.push_back(get_profiling_interval(profiling_periods[0].name, min_sub, min_queue)); - info.push_back(get_profiling_interval(profiling_periods[1].name, min_start, min_sub)); - info.push_back(get_profiling_interval(profiling_periods[2].name, 0, execution_time)); + for (auto& period : profiling_periods) { + unsigned long long sum = 0; + for (auto& duration : all_durations[period.name]) { + sum += (duration.second - duration.first); + } + info.push_back(get_profiling_interval(period.name, 0, sum)); + } return true; } diff --git a/inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.h b/inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.h index 39e6f048e71..e449661e629 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.h +++ b/inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.h @@ -34,6 +34,7 @@ struct ocl_base_event : virtual public event_impl { public: explicit ocl_base_event(uint64_t queue_stamp = 0, bool valid = false) : _queue_stamp(queue_stamp) { _attached = valid; } uint64_t get_queue_stamp() const { return _queue_stamp; } + virtual cl::Event get() = 0; protected: uint64_t _queue_stamp = 0; @@ -77,8 +78,8 @@ protected: struct base_events : virtual public ocl_base_event { public: base_events(std::shared_ptr ctx, std::vector const& ev) - : ocl_base_event(0, true), _ctx(ctx), _events(ev) { - set_queue_stamp(); + : ocl_base_event(0, true), _ctx(ctx) { + process_events(ev); } explicit base_events(std::shared_ptr ctx) : ocl_base_event(0, false), _ctx(ctx) {} @@ -86,28 +87,47 @@ public: void attach_events(const std::vector& ev) { if (_attached) throw std::runtime_error("Trying to attach events to valid event object."); - _events = ev; + process_events(ev); _attached = true; - set_queue_stamp(); } + cl::Event get() { return _last_ocl_event; } std::shared_ptr get_context() const { return _ctx; } private: - void set_queue_stamp() { - uint64_t _queue_stamp_max = 0; - for (size_t i = 0; i < _events.size(); i++) { - auto* _base_event = dynamic_cast(_events[i].get()); - if (_base_event->get_queue_stamp() > _queue_stamp_max) - _queue_stamp_max = _base_event->get_queue_stamp(); - } - _queue_stamp = _queue_stamp_max; - } void wait_impl() override; bool is_set_impl() override; + void process_events(const std::vector& ev) { + for (size_t i = 0; i < ev.size(); i++) { + auto multiple_events = dynamic_cast(ev[i].get()); + if (multiple_events) { + for (size_t j = 0; j < multiple_events->_events.size(); j++) { + if (auto base_ev = dynamic_cast(multiple_events->_events[j].get())) { + auto current_ev_queue_stamp = base_ev->get_queue_stamp(); + if ((_queue_stamp == 0) || (current_ev_queue_stamp > _queue_stamp)) { + _queue_stamp = current_ev_queue_stamp; + _last_ocl_event = base_ev->get(); + } + } + _events.push_back(multiple_events->_events[j]); + } + } else { + if (auto base_ev = dynamic_cast(ev[i].get())) { + auto current_ev_queue_stamp = base_ev->get_queue_stamp(); + if ((_queue_stamp == 0) || (current_ev_queue_stamp > _queue_stamp)) { + _queue_stamp = current_ev_queue_stamp; + _last_ocl_event = base_ev->get(); + } + } + _events.push_back(ev[i]); + } + } + } + bool get_profiling_info_impl(std::list& info) override; + cl::Event _last_ocl_event; std::shared_ptr _ctx; std::vector _events; }; diff --git a/inference-engine/thirdparty/clDNN/src/gpu/ocl_queue_wrapper.cpp b/inference-engine/thirdparty/clDNN/src/gpu/ocl_queue_wrapper.cpp index 18bc5b12757..c06c914326c 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/ocl_queue_wrapper.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/ocl_queue_wrapper.cpp @@ -79,9 +79,11 @@ event_impl::ptr gpu_queue::enqueue_kernel(kernels_cache::kernel_type const& kern std::vector dep_events; auto dep_events_ptr = &dep_events; if (!context()->get_configuration().host_out_of_order) { - for (auto& dep : deps) - if (auto ocl_ev = dynamic_cast(dep.get())) - dep_events.push_back(ocl_ev->get()); + for (auto& dep : deps) { + if (auto ocl_base_ev = dynamic_cast(dep.get())) { + dep_events.push_back(ocl_base_ev->get()); + } + } } else { dep_events_ptr = nullptr; @@ -113,9 +115,10 @@ event_impl::ptr gpu_queue::enqueue_marker(std::vector const& de cl::Event ret_ev; if (!enabled_single_kernel) { std::vector dep_events; - for (auto& dep : deps) - if (auto ocl_ev = dynamic_cast(dep.get())) - dep_events.push_back(ocl_ev->get()); + for (auto& dep : deps) { + if (auto ocl_base_ev = dynamic_cast(dep.get())) + dep_events.push_back(ocl_base_ev->get()); + } try { _command_queue.enqueueMarkerWithWaitList(&dep_events, &ret_ev); @@ -169,8 +172,8 @@ void gpu_queue::release_pending_memory() { void gpu_queue::sync_events(std::vector const& deps) { bool needs_barrier = false; for (auto& dep : deps) { - auto* ocl_ev = dynamic_cast(dep.get()); - if (ocl_ev->get_queue_stamp() > _last_barrier) { + auto* ocl_base_ev = dynamic_cast(dep.get()); + if (ocl_base_ev->get_queue_stamp() > _last_barrier) { needs_barrier = true; } } diff --git a/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.cpp b/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.cpp index 0d1f2d37e11..775fb922318 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.cpp @@ -229,9 +229,10 @@ void gpu_toolkit::release_pending_memory(uint32_t queue_id) { get_command_queue( void gpu_toolkit::wait_for_events(std::vector const& events) { std::vector clevents; - for (auto& ev : events) - if (auto ocl_ev = dynamic_cast(ev.get())) - clevents.push_back(ocl_ev->get()); + for (auto& ev : events) { + if (auto ocl_base_ev = dynamic_cast(ev.get())) + clevents.push_back(ocl_base_ev->get()); + } try { cl::WaitForEvents(clevents); diff --git a/inference-engine/thirdparty/clDNN/src/gpu/primitive_gpu_base.h b/inference-engine/thirdparty/clDNN/src/gpu/primitive_gpu_base.h index 0c01641dbae..0a8a974e914 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/primitive_gpu_base.h +++ b/inference-engine/thirdparty/clDNN/src/gpu/primitive_gpu_base.h @@ -161,6 +161,7 @@ protected: } std::vector tmp_events(events); + std::vector all_events; // TODO - split should be handle in kernel selector by providing multiple kernels. auto split = get_split(); @@ -181,13 +182,17 @@ protected: auto event = _kernels[k].run(net_id, _kernel_data.kernels[k], tmp_events); new_events.push_back(event); + all_events.push_back(event); } tmp_events = new_events; } - bool group_events = split > 1 ? true : false; - return aggregate_events(tmp_events, net_id, group_events); + if ((all_events.size() == 0) && (tmp_events.size() > 0)) + return aggregate_events(tmp_events, net_id); + + bool group_events = (all_events.size() > 1); + return aggregate_events(all_events, net_id, group_events); } };