[IE CLDNN] Another try to fix multiple-kernel implementations profiling (#2630)
This commit is contained in:
parent
3688ff4c51
commit
458425ac9e
@ -102,59 +102,90 @@ bool base_event::get_profiling_info_impl(std::list<instrumentation::profiling_in
|
|||||||
}
|
}
|
||||||
|
|
||||||
void base_events::wait_impl() {
|
void base_events::wait_impl() {
|
||||||
if (!_events.empty()) {
|
if (_last_ocl_event.get() != nullptr) {
|
||||||
for (size_t i = 0; i < _events.size(); i++) {
|
_last_ocl_event.wait();
|
||||||
_events[i]->wait();
|
if (get_context()->logging_enabled()) {
|
||||||
|
get_context()->log(0, "Wait for event: " + std::to_string(_queue_stamp));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool base_events::is_set_impl() {
|
bool base_events::is_set_impl() {
|
||||||
if (!_events.empty()) {
|
if (_last_ocl_event.get() != nullptr) {
|
||||||
for (size_t i = 0; i < _events.size(); i++) {
|
return _last_ocl_event.getInfo<CL_EVENT_COMMAND_EXECUTION_STATUS>() == CL_COMPLETE;
|
||||||
if (!_events[i]->is_set())
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool base_events::get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) {
|
bool base_events::get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) {
|
||||||
cl_ulong min_queue = CL_ULONG_MAX;
|
|
||||||
cl_ulong min_sub = CL_ULONG_MAX;
|
// For every profiling period (i.e. submission / starting / executing),
|
||||||
cl_ulong min_start = CL_ULONG_MAX;
|
// the goal is to sum up all disjoint durations of its projection on the time axis
|
||||||
uint64_t execution_time = 0;
|
|
||||||
|
std::map<std::string, std::vector<std::pair<unsigned long long, unsigned long long>>> all_durations;
|
||||||
|
|
||||||
for (size_t i = 0; i < _events.size(); i++) {
|
for (size_t i = 0; i < _events.size(); i++) {
|
||||||
auto be = dynamic_cast<base_event*>(_events[i].get());
|
auto be = dynamic_cast<base_event*>(_events[i].get());
|
||||||
if (!is_event_profiled(be->_event))
|
if (!is_event_profiled(be->_event))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
cl_ulong curr_queue;
|
for (auto& period : profiling_periods) {
|
||||||
cl_ulong curr_sub;
|
cl_ulong ev_start;
|
||||||
cl_ulong curr_start;
|
cl_ulong ev_end;
|
||||||
cl_ulong curr_end;
|
be->_event.getProfilingInfo(period.start, &ev_start);
|
||||||
be->_event.getProfilingInfo(CL_PROFILING_COMMAND_QUEUED, &curr_queue);
|
be->_event.getProfilingInfo(period.stop, &ev_end);
|
||||||
be->_event.getProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, &curr_sub);
|
auto ev_duration = std::make_pair(static_cast<unsigned long long>(ev_start),
|
||||||
be->_event.getProfilingInfo(CL_PROFILING_COMMAND_START, &curr_start);
|
static_cast<unsigned long long>(ev_end));
|
||||||
be->_event.getProfilingInfo(CL_PROFILING_COMMAND_END, &curr_end);
|
|
||||||
|
|
||||||
if (curr_queue < min_queue)
|
auto& durations = all_durations[period.name];
|
||||||
min_queue = curr_queue;
|
bool ev_duration_merged = false;
|
||||||
|
auto it = durations.begin();
|
||||||
|
|
||||||
if (curr_sub < min_sub)
|
while (it != durations.end()) {
|
||||||
min_sub = curr_sub;
|
auto& duration = *it;
|
||||||
|
if ((duration.second >= ev_duration.first) && (duration.first <= ev_duration.second)) {
|
||||||
|
if ((duration.first == ev_duration.first) && (duration.second == ev_duration.second)) {
|
||||||
|
if (!ev_duration_merged) {
|
||||||
|
ev_duration_merged = true;
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
it = durations.erase(it);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (!ev_duration_merged) {
|
||||||
|
duration.first = std::min(duration.first, ev_duration.first);
|
||||||
|
duration.second = std::max(duration.second, ev_duration.second);
|
||||||
|
ev_duration = duration;
|
||||||
|
ev_duration_merged = true;
|
||||||
|
it++;
|
||||||
|
} else {
|
||||||
|
if (duration.second > ev_duration.second) {
|
||||||
|
ev_duration.second = duration.second;
|
||||||
|
it--;
|
||||||
|
it->second = ev_duration.second;
|
||||||
|
it++;
|
||||||
|
}
|
||||||
|
it = durations.erase(it);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
it++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (curr_start < min_start)
|
if (!ev_duration_merged) {
|
||||||
min_start = curr_start;
|
durations.insert(it, ev_duration);
|
||||||
|
}
|
||||||
execution_time += curr_end - curr_start;
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
info.push_back(get_profiling_interval(profiling_periods[0].name, min_sub, min_queue));
|
for (auto& period : profiling_periods) {
|
||||||
info.push_back(get_profiling_interval(profiling_periods[1].name, min_start, min_sub));
|
unsigned long long sum = 0;
|
||||||
info.push_back(get_profiling_interval(profiling_periods[2].name, 0, execution_time));
|
for (auto& duration : all_durations[period.name]) {
|
||||||
|
sum += (duration.second - duration.first);
|
||||||
|
}
|
||||||
|
info.push_back(get_profiling_interval(period.name, 0, sum));
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -34,6 +34,7 @@ struct ocl_base_event : virtual public event_impl {
|
|||||||
public:
|
public:
|
||||||
explicit ocl_base_event(uint64_t queue_stamp = 0, bool valid = false) : _queue_stamp(queue_stamp) { _attached = valid; }
|
explicit ocl_base_event(uint64_t queue_stamp = 0, bool valid = false) : _queue_stamp(queue_stamp) { _attached = valid; }
|
||||||
uint64_t get_queue_stamp() const { return _queue_stamp; }
|
uint64_t get_queue_stamp() const { return _queue_stamp; }
|
||||||
|
virtual cl::Event get() = 0;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
uint64_t _queue_stamp = 0;
|
uint64_t _queue_stamp = 0;
|
||||||
@ -77,8 +78,8 @@ protected:
|
|||||||
struct base_events : virtual public ocl_base_event {
|
struct base_events : virtual public ocl_base_event {
|
||||||
public:
|
public:
|
||||||
base_events(std::shared_ptr<gpu_toolkit> ctx, std::vector<event_impl::ptr> const& ev)
|
base_events(std::shared_ptr<gpu_toolkit> ctx, std::vector<event_impl::ptr> const& ev)
|
||||||
: ocl_base_event(0, true), _ctx(ctx), _events(ev) {
|
: ocl_base_event(0, true), _ctx(ctx) {
|
||||||
set_queue_stamp();
|
process_events(ev);
|
||||||
}
|
}
|
||||||
|
|
||||||
explicit base_events(std::shared_ptr<gpu_toolkit> ctx) : ocl_base_event(0, false), _ctx(ctx) {}
|
explicit base_events(std::shared_ptr<gpu_toolkit> ctx) : ocl_base_event(0, false), _ctx(ctx) {}
|
||||||
@ -86,28 +87,47 @@ public:
|
|||||||
void attach_events(const std::vector<event_impl::ptr>& ev) {
|
void attach_events(const std::vector<event_impl::ptr>& ev) {
|
||||||
if (_attached)
|
if (_attached)
|
||||||
throw std::runtime_error("Trying to attach events to valid event object.");
|
throw std::runtime_error("Trying to attach events to valid event object.");
|
||||||
_events = ev;
|
process_events(ev);
|
||||||
_attached = true;
|
_attached = true;
|
||||||
set_queue_stamp();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cl::Event get() { return _last_ocl_event; }
|
||||||
std::shared_ptr<gpu_toolkit> get_context() const { return _ctx; }
|
std::shared_ptr<gpu_toolkit> get_context() const { return _ctx; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void set_queue_stamp() {
|
|
||||||
uint64_t _queue_stamp_max = 0;
|
|
||||||
for (size_t i = 0; i < _events.size(); i++) {
|
|
||||||
auto* _base_event = dynamic_cast<base_event*>(_events[i].get());
|
|
||||||
if (_base_event->get_queue_stamp() > _queue_stamp_max)
|
|
||||||
_queue_stamp_max = _base_event->get_queue_stamp();
|
|
||||||
}
|
|
||||||
_queue_stamp = _queue_stamp_max;
|
|
||||||
}
|
|
||||||
void wait_impl() override;
|
void wait_impl() override;
|
||||||
bool is_set_impl() override;
|
bool is_set_impl() override;
|
||||||
|
|
||||||
|
void process_events(const std::vector<event_impl::ptr>& ev) {
|
||||||
|
for (size_t i = 0; i < ev.size(); i++) {
|
||||||
|
auto multiple_events = dynamic_cast<base_events*>(ev[i].get());
|
||||||
|
if (multiple_events) {
|
||||||
|
for (size_t j = 0; j < multiple_events->_events.size(); j++) {
|
||||||
|
if (auto base_ev = dynamic_cast<base_event*>(multiple_events->_events[j].get())) {
|
||||||
|
auto current_ev_queue_stamp = base_ev->get_queue_stamp();
|
||||||
|
if ((_queue_stamp == 0) || (current_ev_queue_stamp > _queue_stamp)) {
|
||||||
|
_queue_stamp = current_ev_queue_stamp;
|
||||||
|
_last_ocl_event = base_ev->get();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_events.push_back(multiple_events->_events[j]);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (auto base_ev = dynamic_cast<base_event*>(ev[i].get())) {
|
||||||
|
auto current_ev_queue_stamp = base_ev->get_queue_stamp();
|
||||||
|
if ((_queue_stamp == 0) || (current_ev_queue_stamp > _queue_stamp)) {
|
||||||
|
_queue_stamp = current_ev_queue_stamp;
|
||||||
|
_last_ocl_event = base_ev->get();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_events.push_back(ev[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
bool get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) override;
|
bool get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) override;
|
||||||
|
|
||||||
|
cl::Event _last_ocl_event;
|
||||||
std::shared_ptr<gpu_toolkit> _ctx;
|
std::shared_ptr<gpu_toolkit> _ctx;
|
||||||
std::vector<event_impl::ptr> _events;
|
std::vector<event_impl::ptr> _events;
|
||||||
};
|
};
|
||||||
|
@ -79,9 +79,11 @@ event_impl::ptr gpu_queue::enqueue_kernel(kernels_cache::kernel_type const& kern
|
|||||||
std::vector<cl::Event> dep_events;
|
std::vector<cl::Event> dep_events;
|
||||||
auto dep_events_ptr = &dep_events;
|
auto dep_events_ptr = &dep_events;
|
||||||
if (!context()->get_configuration().host_out_of_order) {
|
if (!context()->get_configuration().host_out_of_order) {
|
||||||
for (auto& dep : deps)
|
for (auto& dep : deps) {
|
||||||
if (auto ocl_ev = dynamic_cast<base_event*>(dep.get()))
|
if (auto ocl_base_ev = dynamic_cast<ocl_base_event*>(dep.get())) {
|
||||||
dep_events.push_back(ocl_ev->get());
|
dep_events.push_back(ocl_base_ev->get());
|
||||||
|
}
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
dep_events_ptr = nullptr;
|
dep_events_ptr = nullptr;
|
||||||
|
|
||||||
@ -113,9 +115,10 @@ event_impl::ptr gpu_queue::enqueue_marker(std::vector<event_impl::ptr> const& de
|
|||||||
cl::Event ret_ev;
|
cl::Event ret_ev;
|
||||||
if (!enabled_single_kernel) {
|
if (!enabled_single_kernel) {
|
||||||
std::vector<cl::Event> dep_events;
|
std::vector<cl::Event> dep_events;
|
||||||
for (auto& dep : deps)
|
for (auto& dep : deps) {
|
||||||
if (auto ocl_ev = dynamic_cast<base_event*>(dep.get()))
|
if (auto ocl_base_ev = dynamic_cast<ocl_base_event*>(dep.get()))
|
||||||
dep_events.push_back(ocl_ev->get());
|
dep_events.push_back(ocl_base_ev->get());
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
_command_queue.enqueueMarkerWithWaitList(&dep_events, &ret_ev);
|
_command_queue.enqueueMarkerWithWaitList(&dep_events, &ret_ev);
|
||||||
@ -169,8 +172,8 @@ void gpu_queue::release_pending_memory() {
|
|||||||
void gpu_queue::sync_events(std::vector<event_impl::ptr> const& deps) {
|
void gpu_queue::sync_events(std::vector<event_impl::ptr> const& deps) {
|
||||||
bool needs_barrier = false;
|
bool needs_barrier = false;
|
||||||
for (auto& dep : deps) {
|
for (auto& dep : deps) {
|
||||||
auto* ocl_ev = dynamic_cast<ocl_base_event*>(dep.get());
|
auto* ocl_base_ev = dynamic_cast<ocl_base_event*>(dep.get());
|
||||||
if (ocl_ev->get_queue_stamp() > _last_barrier) {
|
if (ocl_base_ev->get_queue_stamp() > _last_barrier) {
|
||||||
needs_barrier = true;
|
needs_barrier = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -229,9 +229,10 @@ void gpu_toolkit::release_pending_memory(uint32_t queue_id) { get_command_queue(
|
|||||||
|
|
||||||
void gpu_toolkit::wait_for_events(std::vector<event_impl::ptr> const& events) {
|
void gpu_toolkit::wait_for_events(std::vector<event_impl::ptr> const& events) {
|
||||||
std::vector<cl::Event> clevents;
|
std::vector<cl::Event> clevents;
|
||||||
for (auto& ev : events)
|
for (auto& ev : events) {
|
||||||
if (auto ocl_ev = dynamic_cast<base_event*>(ev.get()))
|
if (auto ocl_base_ev = dynamic_cast<ocl_base_event*>(ev.get()))
|
||||||
clevents.push_back(ocl_ev->get());
|
clevents.push_back(ocl_base_ev->get());
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
cl::WaitForEvents(clevents);
|
cl::WaitForEvents(clevents);
|
||||||
|
@ -161,6 +161,7 @@ protected:
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::vector<event_impl::ptr> tmp_events(events);
|
std::vector<event_impl::ptr> tmp_events(events);
|
||||||
|
std::vector<event_impl::ptr> all_events;
|
||||||
|
|
||||||
// TODO - split should be handle in kernel selector by providing multiple kernels.
|
// TODO - split should be handle in kernel selector by providing multiple kernels.
|
||||||
auto split = get_split();
|
auto split = get_split();
|
||||||
@ -181,13 +182,17 @@ protected:
|
|||||||
|
|
||||||
auto event = _kernels[k].run(net_id, _kernel_data.kernels[k], tmp_events);
|
auto event = _kernels[k].run(net_id, _kernel_data.kernels[k], tmp_events);
|
||||||
new_events.push_back(event);
|
new_events.push_back(event);
|
||||||
|
all_events.push_back(event);
|
||||||
}
|
}
|
||||||
|
|
||||||
tmp_events = new_events;
|
tmp_events = new_events;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool group_events = split > 1 ? true : false;
|
if ((all_events.size() == 0) && (tmp_events.size() > 0))
|
||||||
return aggregate_events(tmp_events, net_id, group_events);
|
return aggregate_events(tmp_events, net_id);
|
||||||
|
|
||||||
|
bool group_events = (all_events.size() > 1);
|
||||||
|
return aggregate_events(all_events, net_id, group_events);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user