[GPU] Profiling refactoring and fixes (#7714)
This commit is contained in:
parent
17091476d8
commit
c00b0b6ae4
@ -466,11 +466,11 @@ void CLDNNGraph::UpdatePerfStatistics() {
|
|||||||
using duration_t = std::chrono::duration<long long, std::chrono::microseconds::period>;
|
using duration_t = std::chrono::duration<long long, std::chrono::microseconds::period>;
|
||||||
auto count = std::chrono::duration_cast<duration_t>(interval.value->value()).count();
|
auto count = std::chrono::duration_cast<duration_t>(interval.value->value()).count();
|
||||||
|
|
||||||
if (interval.name == "submission") {
|
if (interval.stage == cldnn::instrumentation::profiling_stage::submission) {
|
||||||
pc.cpu_uSec += count;
|
pc.cpu_uSec += count;
|
||||||
} else if (interval.name == "executing") {
|
} else if (interval.stage == cldnn::instrumentation::profiling_stage::executing) {
|
||||||
pc.realTime_uSec += count;
|
pc.realTime_uSec += count;
|
||||||
} else if (interval.name == "duration") { // "duration" is used for CPU layers
|
} else if (interval.stage == cldnn::instrumentation::profiling_stage::duration) { // "duration" is used for CPU layers
|
||||||
pc.cpu_uSec += count;
|
pc.cpu_uSec += count;
|
||||||
|
|
||||||
if (pc.num == 0)
|
if (pc.num == 0)
|
||||||
@ -673,11 +673,11 @@ std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> CLDNNGraph::G
|
|||||||
using duration_t = std::chrono::duration<long long, std::chrono::microseconds::period>;
|
using duration_t = std::chrono::duration<long long, std::chrono::microseconds::period>;
|
||||||
auto count = std::chrono::duration_cast<duration_t>(interval.value->value()).count();
|
auto count = std::chrono::duration_cast<duration_t>(interval.value->value()).count();
|
||||||
|
|
||||||
if (interval.name == "submission") {
|
if (interval.stage == cldnn::instrumentation::profiling_stage::submission) {
|
||||||
cpuTime += count;
|
cpuTime += count;
|
||||||
} else if (interval.name == "executing") {
|
} else if (interval.stage == cldnn::instrumentation::profiling_stage::executing) {
|
||||||
deviceTime += count;
|
deviceTime += count;
|
||||||
} else if (interval.name == "duration") { // "duration" is used for CPU layers
|
} else if (interval.stage == cldnn::instrumentation::profiling_stage::duration) { // "duration" is used for CPU layers
|
||||||
cpuTime += count;
|
cpuTime += count;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -17,6 +17,14 @@ namespace instrumentation {
|
|||||||
/// @addtogroup cpp_event Events Support
|
/// @addtogroup cpp_event Events Support
|
||||||
/// @{
|
/// @{
|
||||||
|
|
||||||
|
/// @brief Represents profiling intervals stages.
|
||||||
|
enum class profiling_stage {
|
||||||
|
submission, // Time spent on submitting command by the host to the device associated with the commandqueue.
|
||||||
|
starting, // Time spent on waiting in the commandqueue before execution.
|
||||||
|
executing, // Time spent on command execution.
|
||||||
|
duration // Time spent on command execution for CPU layers.
|
||||||
|
};
|
||||||
|
|
||||||
/// @brief Helper class to calculate time periods.
|
/// @brief Helper class to calculate time periods.
|
||||||
template <class ClockTy = std::chrono::steady_clock>
|
template <class ClockTy = std::chrono::steady_clock>
|
||||||
class timer {
|
class timer {
|
||||||
@ -55,9 +63,9 @@ private:
|
|||||||
std::chrono::nanoseconds _value;
|
std::chrono::nanoseconds _value;
|
||||||
};
|
};
|
||||||
|
|
||||||
/// @brief Represents prifiling interval as its name and value.
|
/// @brief Represents profiling interval as its type and value.
|
||||||
struct profiling_interval {
|
struct profiling_interval {
|
||||||
std::string name; ///< @brief Display name.
|
profiling_stage stage; ///< @brief Display name.
|
||||||
std::shared_ptr<profiling_period> value; ///< @brief Interval value.
|
std::shared_ptr<profiling_period> value; ///< @brief Interval value.
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -16,7 +16,7 @@ namespace cldnn {
|
|||||||
namespace ocl {
|
namespace ocl {
|
||||||
|
|
||||||
struct profiling_period_ocl_start_stop {
|
struct profiling_period_ocl_start_stop {
|
||||||
const char* name;
|
instrumentation::profiling_stage stage;
|
||||||
cl_profiling_info start;
|
cl_profiling_info start;
|
||||||
cl_profiling_info stop;
|
cl_profiling_info stop;
|
||||||
};
|
};
|
||||||
|
@ -25,10 +25,10 @@ bool is_event_profiled(const cl::Event& event) {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
instrumentation::profiling_interval get_profiling_interval(const char* name, cl_ulong start, cl_ulong end) {
|
instrumentation::profiling_interval get_profiling_interval(instrumentation::profiling_stage stage, cl_ulong start, cl_ulong end) {
|
||||||
auto diff = std::chrono::nanoseconds(end - start);
|
auto diff = std::chrono::nanoseconds(end - start);
|
||||||
auto period = std::make_shared<instrumentation::profiling_period_basic>(diff);
|
auto period = std::make_shared<instrumentation::profiling_period_basic>(diff);
|
||||||
return { name, period };
|
return { stage, period };
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
@ -71,9 +71,9 @@ bool ocl_event::add_event_handler_impl(event_handler, void*) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static const std::vector<profiling_period_ocl_start_stop> profiling_periods{
|
static const std::vector<profiling_period_ocl_start_stop> profiling_periods{
|
||||||
{"submission", CL_PROFILING_COMMAND_QUEUED, CL_PROFILING_COMMAND_SUBMIT},
|
{ instrumentation::profiling_stage::submission, CL_PROFILING_COMMAND_QUEUED, CL_PROFILING_COMMAND_SUBMIT },
|
||||||
{"starting", CL_PROFILING_COMMAND_SUBMIT, CL_PROFILING_COMMAND_START},
|
{ instrumentation::profiling_stage::starting, CL_PROFILING_COMMAND_SUBMIT, CL_PROFILING_COMMAND_START },
|
||||||
{"executing", CL_PROFILING_COMMAND_START, CL_PROFILING_COMMAND_END},
|
{ instrumentation::profiling_stage::executing, CL_PROFILING_COMMAND_START, CL_PROFILING_COMMAND_END },
|
||||||
};
|
};
|
||||||
|
|
||||||
bool ocl_event::get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) {
|
bool ocl_event::get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) {
|
||||||
@ -87,7 +87,7 @@ bool ocl_event::get_profiling_info_impl(std::list<instrumentation::profiling_int
|
|||||||
_event.getProfilingInfo(period.start, &start);
|
_event.getProfilingInfo(period.start, &start);
|
||||||
_event.getProfilingInfo(period.stop, &end);
|
_event.getProfilingInfo(period.stop, &end);
|
||||||
|
|
||||||
info.push_back(get_profiling_interval(period.name, start, end));
|
info.push_back(get_profiling_interval(period.stage, start, end));
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
@ -114,7 +114,7 @@ bool ocl_events::get_profiling_info_impl(std::list<instrumentation::profiling_in
|
|||||||
// For every profiling period (i.e. submission / starting / executing),
|
// For every profiling period (i.e. submission / starting / executing),
|
||||||
// the goal is to sum up all disjoint durations of its projection on the time axis
|
// the goal is to sum up all disjoint durations of its projection on the time axis
|
||||||
|
|
||||||
std::map<std::string, std::vector<std::pair<unsigned long long, unsigned long long>>> all_durations;
|
std::map<instrumentation::profiling_stage, std::vector<std::pair<unsigned long long, unsigned long long>>> all_durations;
|
||||||
|
|
||||||
for (size_t i = 0; i < _events.size(); i++) {
|
for (size_t i = 0; i < _events.size(); i++) {
|
||||||
auto be = downcast<ocl_event>(_events[i].get());
|
auto be = downcast<ocl_event>(_events[i].get());
|
||||||
@ -129,7 +129,7 @@ bool ocl_events::get_profiling_info_impl(std::list<instrumentation::profiling_in
|
|||||||
auto ev_duration = std::make_pair(static_cast<unsigned long long>(ev_start),
|
auto ev_duration = std::make_pair(static_cast<unsigned long long>(ev_start),
|
||||||
static_cast<unsigned long long>(ev_end));
|
static_cast<unsigned long long>(ev_end));
|
||||||
|
|
||||||
auto& durations = all_durations[period.name];
|
auto& durations = all_durations[period.stage];
|
||||||
bool ev_duration_merged = false;
|
bool ev_duration_merged = false;
|
||||||
auto it = durations.begin();
|
auto it = durations.begin();
|
||||||
|
|
||||||
@ -173,21 +173,21 @@ bool ocl_events::get_profiling_info_impl(std::list<instrumentation::profiling_in
|
|||||||
|
|
||||||
for (auto& period : profiling_periods) {
|
for (auto& period : profiling_periods) {
|
||||||
unsigned long long sum = 0;
|
unsigned long long sum = 0;
|
||||||
for (auto& duration : all_durations[period.name]) {
|
for (auto& duration : all_durations[period.stage]) {
|
||||||
sum += (duration.second - duration.first);
|
sum += (duration.second - duration.first);
|
||||||
}
|
}
|
||||||
|
|
||||||
GPU_DEBUG_GET_INSTANCE(debug_config);
|
GPU_DEBUG_GET_INSTANCE(debug_config);
|
||||||
GPU_DEBUG_IF(debug_config->print_multi_kernel_perf) {
|
GPU_DEBUG_IF(debug_config->print_multi_kernel_perf) {
|
||||||
if (0 == strcmp(period.name, "executing")) {
|
if (period.stage == instrumentation::profiling_stage::executing) {
|
||||||
GPU_DEBUG_COUT << "Multi-kernel time: ";
|
GPU_DEBUG_COUT << "Multi-kernel time: ";
|
||||||
for (auto& duration : all_durations[period.name])
|
for (auto& duration : all_durations[period.stage])
|
||||||
std::cout << " " << (duration.second - duration.first) / 1000;
|
std::cout << " " << (duration.second - duration.first) / 1000;
|
||||||
std::cout << " Total " << sum / 1000 << std::endl;
|
std::cout << " Total " << sum / 1000 << std::endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
info.push_back(get_profiling_interval(period.name, 0, sum));
|
info.push_back(get_profiling_interval(period.stage, 0, sum));
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
@ -23,7 +23,7 @@ bool ocl_user_event::get_profiling_info_impl(std::list<cldnn::instrumentation::p
|
|||||||
}
|
}
|
||||||
|
|
||||||
auto period = std::make_shared<instrumentation::profiling_period_basic>(_duration->value());
|
auto period = std::make_shared<instrumentation::profiling_period_basic>(_duration->value());
|
||||||
info.push_back({"duration", period });
|
info.push_back({ instrumentation::profiling_stage::executing, period });
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -115,15 +115,16 @@ protected:
|
|||||||
auto profiling = engine.configuration().enable_profiling;
|
auto profiling = engine.configuration().enable_profiling;
|
||||||
event::ptr event;
|
event::ptr event;
|
||||||
|
|
||||||
if (profiling)
|
if (profiling) {
|
||||||
stream.finish();
|
stream.finish();
|
||||||
|
event = stream.create_user_event(false);
|
||||||
|
}
|
||||||
|
|
||||||
_prim.execute(stream.get_onednn_stream(), _args);
|
_prim.execute(stream.get_onednn_stream(), _args);
|
||||||
|
|
||||||
if (profiling) {
|
if (profiling) {
|
||||||
// Measure all previously queued tasks
|
stream.finish();
|
||||||
event = stream.enqueue_marker({}, true);
|
event->set();
|
||||||
stream.wait_for_events({event});
|
|
||||||
} else {
|
} else {
|
||||||
// Create and set user event as complete
|
// Create and set user event as complete
|
||||||
event = stream.create_user_event(true);
|
event = stream.create_user_event(true);
|
||||||
|
@ -215,7 +215,7 @@ std::vector<std::chrono::nanoseconds> kernel_runner::run_kernels(const kernel_se
|
|||||||
if (event.get() != NULL) {
|
if (event.get() != NULL) {
|
||||||
auto profiling_intervals = event->get_profiling_info();
|
auto profiling_intervals = event->get_profiling_info();
|
||||||
for (auto const& profiling_interval : profiling_intervals) {
|
for (auto const& profiling_interval : profiling_intervals) {
|
||||||
if (profiling_interval.name == "executing") {
|
if (profiling_interval.stage == instrumentation::profiling_stage::executing) {
|
||||||
kernel_run_time = std::min(profiling_interval.value->value(), kernel_run_time);
|
kernel_run_time = std::min(profiling_interval.value->value(), kernel_run_time);
|
||||||
num_of_runs++;
|
num_of_runs++;
|
||||||
break;
|
break;
|
||||||
|
Loading…
Reference in New Issue
Block a user