[GPU] Profiling refactoring and fixes (#7714)
commit c00b0b6ae4
parent 17091476d8
@@ -466,11 +466,11 @@ void CLDNNGraph::UpdatePerfStatistics() {
     using duration_t = std::chrono::duration<long long, std::chrono::microseconds::period>;
     auto count = std::chrono::duration_cast<duration_t>(interval.value->value()).count();

-    if (interval.name == "submission") {
+    if (interval.stage == cldnn::instrumentation::profiling_stage::submission) {
         pc.cpu_uSec += count;
-    } else if (interval.name == "executing") {
+    } else if (interval.stage == cldnn::instrumentation::profiling_stage::executing) {
         pc.realTime_uSec += count;
-    } else if (interval.name == "duration") { // "duration" is used for CPU layers
+    } else if (interval.stage == cldnn::instrumentation::profiling_stage::duration) { // "duration" is used for CPU layers
         pc.cpu_uSec += count;

         if (pc.num == 0)

@@ -673,11 +673,11 @@ std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> CLDNNGraph::G
     using duration_t = std::chrono::duration<long long, std::chrono::microseconds::period>;
     auto count = std::chrono::duration_cast<duration_t>(interval.value->value()).count();

-    if (interval.name == "submission") {
+    if (interval.stage == cldnn::instrumentation::profiling_stage::submission) {
         cpuTime += count;
-    } else if (interval.name == "executing") {
+    } else if (interval.stage == cldnn::instrumentation::profiling_stage::executing) {
         deviceTime += count;
-    } else if (interval.name == "duration") { // "duration" is used for CPU layers
+    } else if (interval.stage == cldnn::instrumentation::profiling_stage::duration) { // "duration" is used for CPU layers
         cpuTime += count;
     }
 }

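The stage-to-counter mapping above is what feeds the per-layer performance report that applications read back. For context, a minimal sketch of the application side, assuming the classic InferenceEngine API of this release line (request setup is omitted and illustrative; performance counting must be enabled in the plugin config, e.g. PERF_COUNT=YES):

    #include <inference_engine.hpp>
    #include <iostream>

    void print_gpu_perf_counts(InferenceEngine::InferRequest& request) {
        // Each entry maps a layer name to its profiling info; realTime_uSec carries the
        // device ("executing") time and cpu_uSec the host-side ("submission"/"duration") time.
        for (const auto& kv : request.GetPerformanceCounts()) {
            const InferenceEngine::InferenceEngineProfileInfo& info = kv.second;
            std::cout << kv.first << ": realTime=" << info.realTime_uSec
                      << " us, cpuTime=" << info.cpu_uSec << " us\n";
        }
    }
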
@@ -17,6 +17,14 @@ namespace instrumentation {
 /// @addtogroup cpp_event Events Support
 /// @{

+/// @brief Represents profiling intervals stages.
+enum class profiling_stage {
+    submission, // Time spent on submitting command by the host to the device associated with the commandqueue.
+    starting, // Time spent on waiting in the commandqueue before execution.
+    executing, // Time spent on command execution.
+    duration // Time spent on command execution for CPU layers.
+};
+
 /// @brief Helper class to calculate time periods.
 template <class ClockTy = std::chrono::steady_clock>
 class timer {

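The new enum replaces the display-name strings ("submission", "executing", "duration") that callers previously compared against. A minimal sketch of the consumption pattern this enables; the accumulate_times helper and its counters are illustrative, not part of this change, and the clDNN profiling header that declares these types must also be included (its path depends on the source layout):

    #include <chrono>
    #include <list>

    // Sum host- and device-side time from a list of profiling intervals,
    // keyed by stage instead of by display-name string.
    void accumulate_times(const std::list<cldnn::instrumentation::profiling_interval>& intervals,
                          long long& cpu_us, long long& device_us) {
        using namespace cldnn::instrumentation;
        for (const auto& interval : intervals) {
            auto us = std::chrono::duration_cast<std::chrono::microseconds>(interval.value->value()).count();
            switch (interval.stage) {
            case profiling_stage::submission:
            case profiling_stage::duration:   // CPU layers report their whole run as "duration"
                cpu_us += us;
                break;
            case profiling_stage::executing:
                device_us += us;
                break;
            default:                          // e.g. profiling_stage::starting (queue wait) is not accumulated here
                break;
            }
        }
    }
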
@@ -55,9 +63,9 @@ private:
     std::chrono::nanoseconds _value;
 };

-/// @brief Represents prifiling interval as its name and value.
+/// @brief Represents profiling interval as its type and value.
 struct profiling_interval {
-    std::string name; ///< @brief Display name.
+    profiling_stage stage; ///< @brief Display name.
     std::shared_ptr<profiling_period> value; ///< @brief Interval value.
 };

@@ -16,7 +16,7 @@ namespace cldnn {
 namespace ocl {

 struct profiling_period_ocl_start_stop {
-    const char* name;
+    instrumentation::profiling_stage stage;
     cl_profiling_info start;
     cl_profiling_info stop;
 };

@@ -25,10 +25,10 @@ bool is_event_profiled(const cl::Event& event) {
     return false;
 }

-instrumentation::profiling_interval get_profiling_interval(const char* name, cl_ulong start, cl_ulong end) {
+instrumentation::profiling_interval get_profiling_interval(instrumentation::profiling_stage stage, cl_ulong start, cl_ulong end) {
     auto diff = std::chrono::nanoseconds(end - start);
     auto period = std::make_shared<instrumentation::profiling_period_basic>(diff);
-    return { name, period };
+    return { stage, period };
 }

 } // namespace

@@ -71,9 +71,9 @@ bool ocl_event::add_event_handler_impl(event_handler, void*) {
 }

 static const std::vector<profiling_period_ocl_start_stop> profiling_periods{
-    {"submission", CL_PROFILING_COMMAND_QUEUED, CL_PROFILING_COMMAND_SUBMIT},
-    {"starting", CL_PROFILING_COMMAND_SUBMIT, CL_PROFILING_COMMAND_START},
-    {"executing", CL_PROFILING_COMMAND_START, CL_PROFILING_COMMAND_END},
+    { instrumentation::profiling_stage::submission, CL_PROFILING_COMMAND_QUEUED, CL_PROFILING_COMMAND_SUBMIT },
+    { instrumentation::profiling_stage::starting, CL_PROFILING_COMMAND_SUBMIT, CL_PROFILING_COMMAND_START },
+    { instrumentation::profiling_stage::executing, CL_PROFILING_COMMAND_START, CL_PROFILING_COMMAND_END },
 };

 bool ocl_event::get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) {

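Each table entry pairs a stage with the two OpenCL event timestamps that bound it (QUEUED to SUBMIT, SUBMIT to START, START to END). For reference, the same three durations can be read from a single profiled cl::Event with the standard OpenCL C++ bindings; a standalone sketch, not code from this change, and it assumes the queue was created with CL_QUEUE_PROFILING_ENABLE:

    #include <CL/cl2.hpp>
    #include <chrono>

    struct stage_times { std::chrono::nanoseconds submission, starting, executing; };

    stage_times read_stage_times(const cl::Event& ev) {
        cl_ulong queued = ev.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>();
        cl_ulong submit = ev.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>();
        cl_ulong start  = ev.getProfilingInfo<CL_PROFILING_COMMAND_START>();
        cl_ulong end    = ev.getProfilingInfo<CL_PROFILING_COMMAND_END>();
        return { std::chrono::nanoseconds(submit - queued),   // submission
                 std::chrono::nanoseconds(start - submit),    // starting (queue wait)
                 std::chrono::nanoseconds(end - start) };     // executing
    }
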
@@ -87,7 +87,7 @@ bool ocl_event::get_profiling_info_impl(std::list<instrumentation::profiling_int
         _event.getProfilingInfo(period.start, &start);
         _event.getProfilingInfo(period.stop, &end);

-        info.push_back(get_profiling_interval(period.name, start, end));
+        info.push_back(get_profiling_interval(period.stage, start, end));
     }

     return true;

@@ -114,7 +114,7 @@ bool ocl_events::get_profiling_info_impl(std::list<instrumentation::profiling_in
     // For every profiling period (i.e. submission / starting / executing),
     // the goal is to sum up all disjoint durations of its projection on the time axis

-    std::map<std::string, std::vector<std::pair<unsigned long long, unsigned long long>>> all_durations;
+    std::map<instrumentation::profiling_stage, std::vector<std::pair<unsigned long long, unsigned long long>>> all_durations;

     for (size_t i = 0; i < _events.size(); i++) {
         auto be = downcast<ocl_event>(_events[i].get());

@@ -129,7 +129,7 @@ bool ocl_events::get_profiling_info_impl(std::list<instrumentation::profiling_in
             auto ev_duration = std::make_pair(static_cast<unsigned long long>(ev_start),
                                               static_cast<unsigned long long>(ev_end));

-            auto& durations = all_durations[period.name];
+            auto& durations = all_durations[period.stage];
             bool ev_duration_merged = false;
             auto it = durations.begin();

@@ -173,21 +173,21 @@ bool ocl_events::get_profiling_info_impl(std::list<instrumentation::profiling_in

     for (auto& period : profiling_periods) {
         unsigned long long sum = 0;
-        for (auto& duration : all_durations[period.name]) {
+        for (auto& duration : all_durations[period.stage]) {
             sum += (duration.second - duration.first);
         }

         GPU_DEBUG_GET_INSTANCE(debug_config);
         GPU_DEBUG_IF(debug_config->print_multi_kernel_perf) {
-            if (0 == strcmp(period.name, "executing")) {
+            if (period.stage == instrumentation::profiling_stage::executing) {
                 GPU_DEBUG_COUT << "Multi-kernel time: ";
-                for (auto& duration : all_durations[period.name])
+                for (auto& duration : all_durations[period.stage])
                     std::cout << " " << (duration.second - duration.first) / 1000;
                 std::cout << " Total " << sum / 1000 << std::endl;
             }
         }

-        info.push_back(get_profiling_interval(period.name, 0, sum));
+        info.push_back(get_profiling_interval(period.stage, 0, sum));
     }

     return true;

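The hunks above only show where the per-stage (start, end) pairs are collected and summed; the "disjoint durations" mentioned in the comment come from merging overlapping ranges so that concurrently executing kernels are not double-counted. A standalone sketch of that technique, not the plugin's exact merge loop:

    #include <algorithm>
    #include <utility>
    #include <vector>

    // Sum the total time covered by a set of possibly overlapping [start, end) ranges.
    unsigned long long sum_disjoint(std::vector<std::pair<unsigned long long, unsigned long long>> ranges) {
        std::sort(ranges.begin(), ranges.end());
        unsigned long long total = 0, cur_start = 0, cur_end = 0;
        bool open = false;
        for (const auto& r : ranges) {
            if (!open || r.first > cur_end) {       // disjoint from the current merged range
                if (open)
                    total += cur_end - cur_start;
                cur_start = r.first;
                cur_end = r.second;
                open = true;
            } else {                                // overlapping: extend the current merged range
                cur_end = std::max(cur_end, r.second);
            }
        }
        if (open)
            total += cur_end - cur_start;
        return total;
    }
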
@@ -23,7 +23,7 @@ bool ocl_user_event::get_profiling_info_impl(std::list<cldnn::instrumentation::p
     }

     auto period = std::make_shared<instrumentation::profiling_period_basic>(_duration->value());
-    info.push_back({"duration", period });
+    info.push_back({ instrumentation::profiling_stage::executing, period });
     return true;
 }

@@ -115,15 +115,16 @@ protected:
         auto profiling = engine.configuration().enable_profiling;
         event::ptr event;

-        if (profiling)
+        if (profiling) {
             stream.finish();
+            event = stream.create_user_event(false);
+        }

         _prim.execute(stream.get_onednn_stream(), _args);

         if (profiling) {
-            // Measure all previously queued tasks
-            event = stream.enqueue_marker({}, true);
-            stream.wait_for_events({event});
+            stream.finish();
+            event->set();
         } else {
             // Create and set user event as complete
             event = stream.create_user_event(true);

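In the profiled path the queue is now drained before the primitive is submitted, then drained again before the user event is signalled, so the reported interval covers roughly this primitive alone rather than a marker over everything previously queued (which is what the removed enqueue_marker path measured). The host-side timing idea is equivalent to the following standalone sketch; time_primitive, submit_primitive, and queue_finish are illustrative stand-ins for the real stream calls, not part of this change:

    #include <chrono>

    template <typename Submit, typename Finish>
    std::chrono::nanoseconds time_primitive(Submit submit_primitive, Finish queue_finish) {
        queue_finish();                                   // make sure nothing else is still running
        auto start = std::chrono::steady_clock::now();
        submit_primitive();                               // enqueue the primitive
        queue_finish();                                   // block until it has actually completed
        return std::chrono::duration_cast<std::chrono::nanoseconds>(
            std::chrono::steady_clock::now() - start);
    }
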
@@ -215,7 +215,7 @@ std::vector<std::chrono::nanoseconds> kernel_runner::run_kernels(const kernel_se
     if (event.get() != NULL) {
         auto profiling_intervals = event->get_profiling_info();
         for (auto const& profiling_interval : profiling_intervals) {
-            if (profiling_interval.name == "executing") {
+            if (profiling_interval.stage == instrumentation::profiling_stage::executing) {
                 kernel_run_time = std::min(profiling_interval.value->value(), kernel_run_time);
                 num_of_runs++;
                 break;