From 152511daa8ce5612c035191a831faec5c8029a57 Mon Sep 17 00:00:00 2001 From: 18582088138 <752038@qq.com> Date: Wed, 19 Oct 2022 22:59:12 +0800 Subject: [PATCH] Benchmark sort opt (#12350) * Benchmark_app python and c/c++ script add op profiling feature * Fix compile bug about benchmark c/c++ op profiling * Fix Compile Bug * Fix issue in PR#12350 : 1. report_type info update; 2. priority of -pc and -pcsort; 3.align format between c++ and python; 4.check 'proportion' for 0 and print N/A; 5.detect the confusing print info * Fix 8/17 review suggestion * Fix 8/23 suggestion * Fix the clang-format issue * Fix the win-cc issue * Fix win-cc issue about conversion from 'double' to 'float', possible loss of data * Push PR by ODT group account Co-authored-by: Chen Peter Co-authored-by: Fiona Zhao --- samples/cpp/benchmark_app/benchmark_app.hpp | 11 + samples/cpp/benchmark_app/main.cpp | 26 ++- .../cpp/benchmark_app/statistics_report.cpp | 94 ++++++++ .../cpp/benchmark_app/statistics_report.hpp | 9 + .../common/utils/include/samples/common.hpp | 205 ++++++++++++++++++ .../openvino/tools/benchmark/main.py | 15 +- .../openvino/tools/benchmark/parameters.py | 6 + .../benchmark/utils/statistics_report.py | 23 +- .../openvino/tools/benchmark/utils/utils.py | 62 ++++++ 9 files changed, 444 insertions(+), 7 deletions(-) mode change 100644 => 100755 samples/cpp/benchmark_app/main.cpp diff --git a/samples/cpp/benchmark_app/benchmark_app.hpp b/samples/cpp/benchmark_app/benchmark_app.hpp index d1d76fe3c8a..d02f04c9e02 100644 --- a/samples/cpp/benchmark_app/benchmark_app.hpp +++ b/samples/cpp/benchmark_app/benchmark_app.hpp @@ -157,6 +157,13 @@ static const char progress_message[] = // @brief message for performance counters option static const char pc_message[] = "Optional. Report performance counters."; +// @brief message for sorted performance counters option +static const char pc_sort_message[] = + "Optional. Report performance counters and analysis the sort hotpoint opts. " + " \"sort\" Analysis opts time cost, print by hotpoint order " + " \"no_sort\" Analysis opts time cost, print by normal order " + " \"simple_sort\" Analysis opts time cost, only print EXECUTED opts by normal order"; + // @brief message for performance counters for sequence option static const char pcseq_message[] = "Optional. Report latencies for each shape in -data_shape sequence."; @@ -315,6 +322,9 @@ DEFINE_bool(progress, false, progress_message); /// @brief Define flag for showing performance counters
DEFINE_bool(pc, false, pc_message); +/// @brief Define flag for showing sorted performance counters
+DEFINE_string(pcsort, "", pc_sort_message); + /// @brief Define flag for showing performance sequence counters
DEFINE_bool(pcseq, false, pcseq_message); @@ -411,6 +421,7 @@ static void show_usage() { std::cout << " -json_stats " << json_stats_message << std::endl; std::cout << " -exec_graph_path " << exec_graph_path_message << std::endl; std::cout << " -pc " << pc_message << std::endl; + std::cout << " -pcsort " << pc_sort_message << std::endl; std::cout << " -pcseq " << pcseq_message << std::endl; std::cout << " -dump_config " << dump_config_message << std::endl; std::cout << " -load_config " << load_config_message << std::endl; diff --git a/samples/cpp/benchmark_app/main.cpp b/samples/cpp/benchmark_app/main.cpp old mode 100644 new mode 100755 index 71b8a0fd91d..85e84c7cdc9 --- a/samples/cpp/benchmark_app/main.cpp +++ b/samples/cpp/benchmark_app/main.cpp @@ -66,9 +66,9 @@ bool parse_and_check_command_line(int argc, char* argv[]) { "(those options can be used in OpenVINO together), but a benchmark_app UI rule."); } if (!FLAGS_report_type.empty() && FLAGS_report_type != noCntReport && FLAGS_report_type != averageCntReport && - FLAGS_report_type != detailedCntReport) { + FLAGS_report_type != detailedCntReport && FLAGS_report_type != sortDetailedCntReport) { std::string err = "only " + std::string(noCntReport) + "/" + std::string(averageCntReport) + "/" + - std::string(detailedCntReport) + + std::string(detailedCntReport) + "/" + std::string(sortDetailedCntReport) + " report types are supported (invalid -report_type option value)"; throw std::logic_error(err); } @@ -77,6 +77,12 @@ bool parse_and_check_command_line(int argc, char* argv[]) { throw std::logic_error("only " + std::string(detailedCntReport) + " report type is supported for MULTI device"); } + if (!FLAGS_pcsort.empty() && FLAGS_pcsort != "sort" && FLAGS_pcsort != "no_sort" && FLAGS_pcsort != "simple_sort") { + std::string pcsort_err = std::string("Incorrect performance count sort . Please set -pcsort option to ") + + std::string("'sort', 'no_sort', 'simple_sort'."); + throw std::logic_error(pcsort_err); + } + bool isNetworkCompiled = fileExt(FLAGS_m) == "blob"; bool isPrecisionSet = !(FLAGS_ip.empty() && FLAGS_op.empty() && FLAGS_iop.empty()); if (isNetworkCompiled && isPrecisionSet) { @@ -304,7 +310,8 @@ int main(int argc, char* argv[]) { (device_config.at(ov::enable_profiling.name()).as())) { slog::warn << "Performance counters for " << device << " device is turned on. To print results use -pc option." << slog::endl; - } else if (FLAGS_report_type == detailedCntReport || FLAGS_report_type == averageCntReport) { + } else if (FLAGS_report_type == detailedCntReport || FLAGS_report_type == averageCntReport || + FLAGS_report_type == sortDetailedCntReport) { slog::warn << "Turn on performance counters for " << device << " device since report type is " << FLAGS_report_type << "." << slog::endl; device_config.emplace(ov::enable_profiling(true)); @@ -312,6 +319,10 @@ int main(int argc, char* argv[]) { slog::warn << "Turn on performance counters for " << device << " device due to execution graph dumping." << slog::endl; device_config.emplace(ov::enable_profiling(true)); + } else if (!FLAGS_pcsort.empty()) { + slog::warn << "Turn on sorted performance counters for " << device << " device since pcsort is" + << FLAGS_pcsort << "." << slog::endl; + device_config.emplace(ov::enable_profiling(true)); } else { // set to default value device_config.emplace(ov::enable_profiling(FLAGS_pc)); @@ -1171,7 +1182,14 @@ int main(int argc, char* argv[]) { std::vector> perfCounts; for (size_t ireq = 0; ireq < nireq; ireq++) { auto reqPerfCounts = inferRequestsQueue.requests[ireq]->get_performance_counts(); - if (FLAGS_pc) { + if (!FLAGS_pcsort.empty()) { + slog::info << "Sort performance counts for " << ireq << "-th infer request:" << slog::endl; + printPerformanceCountsSort(reqPerfCounts, + std::cout, + getFullDeviceName(core, FLAGS_d), + FLAGS_pcsort, + false); + } else if (FLAGS_pc) { slog::info << "Performance counts for " << ireq << "-th infer request:" << slog::endl; printPerformanceCounts(reqPerfCounts, std::cout, getFullDeviceName(core, FLAGS_d), false); } diff --git a/samples/cpp/benchmark_app/statistics_report.cpp b/samples/cpp/benchmark_app/statistics_report.cpp index 89f8a74e747..c8c89ac07b7 100644 --- a/samples/cpp/benchmark_app/statistics_report.cpp +++ b/samples/cpp/benchmark_app/statistics_report.cpp @@ -93,6 +93,7 @@ void StatisticsReport::dump_performance_counters_request(CsvDumper& dumper, cons total_cpu += layer.cpu_time; dumper.endLine(); } + dumper << "Total" << "" << "" @@ -102,6 +103,55 @@ void StatisticsReport::dump_performance_counters_request(CsvDumper& dumper, cons dumper.endLine(); } +void StatisticsReport::dump_sort_performance_counters_request(CsvDumper& dumper, + const PerformanceCounters& perfCounts) { + std::chrono::microseconds total = std::chrono::microseconds::zero(); + std::chrono::microseconds total_cpu = std::chrono::microseconds::zero(); + int layersize = 0; + + dumper << "layerName" + << "execStatus" + << "layerType" + << "execType"; + dumper << "realTime (ms)" + << "cpuTime (ms)" + << " %"; + dumper.endLine(); + + for (const auto& layer : perfCounts) { + if (status_names[(int)layer.status] == "EXECUTED") { + total += layer.real_time; + total_cpu += layer.cpu_time; + } + } + + // sort perfcounter + std::vector profiling{std::begin(perfCounts), std::end(perfCounts)}; + std::sort(profiling.begin(), profiling.end(), sort_profiling_descend); + for (const auto& layer : profiling) { + if (status_names[(int)layer.status] == "EXECUTED") { + dumper << layer.node_name; // layer name + dumper << ((int)layer.status < (sizeof(status_names) / sizeof(status_names[0])) + ? status_names[(int)layer.status] + : "INVALID_STATUS"); + dumper << layer.node_type << layer.exec_type; + dumper << std::to_string(layer.real_time.count() / 1000.0) + << std::to_string(layer.cpu_time.count() / 1000.0); + dumper << (layer.real_time * 1.0 / total) * 100; + dumper.endLine(); + layersize += 1; + } + } + + dumper << "Total" + << "" + << "" + << ""; + dumper << total.count() / 1000.0 << total_cpu.count() / 1000.0 << 100.0; + dumper.endLine(); + dumper.endLine(); +} + StatisticsReport::PerformanceCounters StatisticsReport::get_average_performance_counters( const std::vector& perfCounts) { StatisticsReport::PerformanceCounters performanceCountersAvg; @@ -148,6 +198,10 @@ void StatisticsReport::dump_performance_counters(const std::vector sortPerfCounts{std::begin(perfCounts), std::end(perfCounts)}; + std::sort(sortPerfCounts.begin(), sortPerfCounts.end(), sort_profiling_descend); + + for (const auto& layer : sortPerfCounts) { + nlohmann::json item; + item["name"] = layer.node_name; // layer name + item["status"] = + ((int)layer.status < (sizeof(status_names) / sizeof(status_names[0])) ? status_names[(int)layer.status] + : "INVALID_STATUS"); + item["node_type"] = layer.node_type; + item["exec_type"] = layer.exec_type; + item["real_time"] = layer.real_time.count() / 1000.0; + item["cpu_time"] = layer.cpu_time.count() / 1000.0; + item["%"] = std::round(layer.real_time * 10000.0 / total) / 100; + js["nodes"].push_back(item); + } + + js["total_real_time"] = total.count() / 1000.0; + js["total_cpu_time"] = total_cpu.count() / 1000.0; + return js; +} + void LatencyMetrics::write_to_stream(std::ostream& stream) const { std::ios::fmtflags fmt(std::cout.flags()); stream << data_shape << ";" << std::fixed << std::setprecision(2) << median_or_percentile << ";" << avg << ";" diff --git a/samples/cpp/benchmark_app/statistics_report.hpp b/samples/cpp/benchmark_app/statistics_report.hpp index d54d46e58b2..cd0128fc377 100644 --- a/samples/cpp/benchmark_app/statistics_report.hpp +++ b/samples/cpp/benchmark_app/statistics_report.hpp @@ -22,6 +22,7 @@ static constexpr char noCntReport[] = "no_counters"; static constexpr char averageCntReport[] = "average_counters"; static constexpr char detailedCntReport[] = "detailed_counters"; +static constexpr char sortDetailedCntReport[] = "sort_detailed_counters"; /// @brief Responsible for calculating different latency metrics class LatencyMetrics { @@ -143,6 +144,10 @@ public: private: void dump_performance_counters_request(CsvDumper& dumper, const PerformanceCounters& perfCounts); + void dump_sort_performance_counters_request(CsvDumper& dumper, const PerformanceCounters& perfCounts); + static bool sort_profiling_descend(const ov::ProfilingInfo& profiling1, const ov::ProfilingInfo& profiling2) { + return profiling1.real_time > profiling2.real_time; + } protected: // configuration of current benchmark execution @@ -168,4 +173,8 @@ public: private: void dump_parameters(nlohmann::json& js, const StatisticsReport::Parameters& parameters); const nlohmann::json perf_counters_to_json(const StatisticsReport::PerformanceCounters& perfCounts); + const nlohmann::json sort_perf_counters_to_json(const StatisticsReport::PerformanceCounters& perfCounts); + static bool sort_profiling_descend(const ov::ProfilingInfo& profiling1, const ov::ProfilingInfo& profiling2) { + return profiling1.real_time > profiling2.real_time; + } }; diff --git a/samples/cpp/common/utils/include/samples/common.hpp b/samples/cpp/common/utils/include/samples/common.hpp index 1dbfd43f917..fe4ff5b610b 100644 --- a/samples/cpp/common/utils/include/samples/common.hpp +++ b/samples/cpp/common/utils/include/samples/common.hpp @@ -23,11 +23,18 @@ #include #include +using std::setprecision; + // clang-format off #include "openvino/openvino.hpp" #include "slog.hpp" // clang-format on +// @brief performance counters sort +static constexpr char pcSort[] = "sort"; +static constexpr char pcNoSort[] = "no_sort"; +static constexpr char pcSimpleSort[] = "simple_sort"; + #ifndef UNUSED # if defined(_MSC_VER) && !defined(__clang__) # define UNUSED @@ -1048,3 +1055,201 @@ static UNUSED void printPerformanceCounts(ov::InferRequest request, auto performanceMap = request.get_profiling_info(); printPerformanceCounts(performanceMap, stream, deviceName, bshowHeader); } + +static UNUSED void printPerformanceCountsNoSort(std::vector performanceData, + std::ostream& stream, + std::string deviceName, + bool bshowHeader = true) { + std::chrono::microseconds totalTime = std::chrono::microseconds::zero(); + // Print performance counts + if (bshowHeader) { + stream << std::endl << "performance counts:" << std::endl << std::endl; + } + std::ios::fmtflags fmt(std::cout.flags()); + + for (const auto& it : performanceData) { + if (it.real_time.count() > 0) { + totalTime += it.real_time; + } + } + if (totalTime.count() != 0) { + for (const auto& it : performanceData) { + std::string toPrint(it.node_name); + const int maxLayerName = 30; + + if (it.node_name.length() >= maxLayerName) { + toPrint = it.node_name.substr(0, maxLayerName - 4); + toPrint += "..."; + } + + stream << std::setw(maxLayerName) << std::left << toPrint; + switch (it.status) { + case ov::ProfilingInfo::Status::EXECUTED: + stream << std::setw(15) << std::left << "EXECUTED"; + break; + case ov::ProfilingInfo::Status::NOT_RUN: + stream << std::setw(15) << std::left << "NOT_RUN"; + break; + case ov::ProfilingInfo::Status::OPTIMIZED_OUT: + stream << std::setw(15) << std::left << "OPTIMIZED_OUT"; + break; + } + stream << std::setw(30) << std::left << "layerType: " + std::string(it.node_type) + " "; + stream << std::setw(20) << std::left << "realTime: " + std::to_string(it.real_time.count()); + stream << std::setw(15) << std::left << "cpu: " + std::to_string(it.cpu_time.count()); + double opt_proportion = it.real_time.count() * 100.0 / totalTime.count(); + std::stringstream opt_proportion_ss; + opt_proportion_ss << std::fixed << std::setprecision(2) << opt_proportion; + std::string opt_proportion_str = opt_proportion_ss.str(); + if (opt_proportion_str == "0.00") { + opt_proportion_str = "N/A"; + } + stream << std::setw(20) << std::left << "proportion: " + opt_proportion_str + "%"; + stream << " execType: " << it.exec_type << std::endl; + } + } + stream << std::setw(20) << std::left << "Total time: " + std::to_string(totalTime.count() / 1000.0) + << " microseconds" << std::endl; + std::cout << std::endl; + std::cout << "Full device name: " << deviceName << std::endl; + std::cout << std::endl; + std::cout.flags(fmt); +} + +static UNUSED bool sort_pc_descend(const ov::ProfilingInfo& profiling1, const ov::ProfilingInfo& profiling2) { + return profiling1.real_time > profiling2.real_time; +} + +static UNUSED void printPerformanceCountsDescendSort(std::vector performanceData, + std::ostream& stream, + std::string deviceName, + bool bshowHeader = true) { + std::chrono::microseconds totalTime = std::chrono::microseconds::zero(); + // Print performance counts + if (bshowHeader) { + stream << std::endl << "performance counts:" << std::endl << std::endl; + } + std::ios::fmtflags fmt(std::cout.flags()); + + for (const auto& it : performanceData) { + if (it.real_time.count() > 0) { + totalTime += it.real_time; + } + } + if (totalTime.count() != 0) { + // sort perfcounter + std::vector sortPerfCounts{std::begin(performanceData), std::end(performanceData)}; + std::sort(sortPerfCounts.begin(), sortPerfCounts.end(), sort_pc_descend); + + for (const auto& it : sortPerfCounts) { + std::string toPrint(it.node_name); + const int maxLayerName = 30; + + if (it.node_name.length() >= maxLayerName) { + toPrint = it.node_name.substr(0, maxLayerName - 4); + toPrint += "..."; + } + + stream << std::setw(maxLayerName) << std::left << toPrint; + switch (it.status) { + case ov::ProfilingInfo::Status::EXECUTED: + stream << std::setw(15) << std::left << "EXECUTED"; + break; + case ov::ProfilingInfo::Status::NOT_RUN: + stream << std::setw(15) << std::left << "NOT_RUN"; + break; + case ov::ProfilingInfo::Status::OPTIMIZED_OUT: + stream << std::setw(15) << std::left << "OPTIMIZED_OUT"; + break; + } + stream << std::setw(30) << std::left << "layerType: " + std::string(it.node_type) + " "; + stream << std::setw(20) << std::left << "realTime: " + std::to_string(it.real_time.count()); + stream << std::setw(15) << std::left << "cpu: " + std::to_string(it.cpu_time.count()); + double opt_proportion = it.real_time.count() * 100.0 / totalTime.count(); + std::stringstream opt_proportion_ss; + opt_proportion_ss << std::fixed << std::setprecision(2) << opt_proportion; + std::string opt_proportion_str = opt_proportion_ss.str(); + if (opt_proportion_str == "0.00") { + opt_proportion_str = "N/A"; + } + stream << std::setw(20) << std::left << "proportion: " + opt_proportion_str + "%"; + stream << " execType: " << it.exec_type << std::endl; + } + } + stream << std::setw(20) << std::left << "Total time: " + std::to_string(totalTime.count() / 1000.0) + << " microseconds" << std::endl; + std::cout << std::endl; + std::cout << "Full device name: " << deviceName << std::endl; + std::cout << std::endl; + std::cout.flags(fmt); +} + +static UNUSED void printPerformanceCountsSimpleSort(std::vector performanceData, + std::ostream& stream, + std::string deviceName, + bool bshowHeader = true) { + std::chrono::microseconds totalTime = std::chrono::microseconds::zero(); + // Print performance counts + if (bshowHeader) { + stream << std::endl << "performance counts:" << std::endl << std::endl; + } + std::ios::fmtflags fmt(std::cout.flags()); + + for (const auto& it : performanceData) { + if (it.real_time.count() > 0) { + totalTime += it.real_time; + } + } + if (totalTime.count() != 0) { + // sort perfcounter + std::vector sortPerfCounts{std::begin(performanceData), std::end(performanceData)}; + std::sort(sortPerfCounts.begin(), sortPerfCounts.end(), sort_pc_descend); + + for (const auto& it : sortPerfCounts) { + if (it.status == ov::ProfilingInfo::Status::EXECUTED) { + std::string toPrint(it.node_name); + const int maxLayerName = 30; + + if (it.node_name.length() >= maxLayerName) { + toPrint = it.node_name.substr(0, maxLayerName - 4); + toPrint += "..."; + } + + stream << std::setw(maxLayerName) << std::left << toPrint; + stream << std::setw(15) << std::left << "EXECUTED"; + stream << std::setw(30) << std::left << "layerType: " + std::string(it.node_type) + " "; + stream << std::setw(20) << std::left << "realTime: " + std::to_string(it.real_time.count()); + stream << std::setw(15) << std::left << "cpu: " + std::to_string(it.cpu_time.count()); + double opt_proportion = it.real_time.count() * 100.0 / totalTime.count(); + std::stringstream opt_proportion_ss; + opt_proportion_ss << std::fixed << std::setprecision(2) << opt_proportion; + std::string opt_proportion_str = opt_proportion_ss.str(); + if (opt_proportion_str == "0.00") { + opt_proportion_str = "N/A"; + } + stream << std::setw(20) << std::left << "proportion: " + opt_proportion_str + "%"; + stream << " execType: " << it.exec_type << std::endl; + } + } + } + stream << std::setw(20) << std::left << "Total time: " + std::to_string(totalTime.count() / 1000.0) + << " microseconds" << std::endl; + std::cout << std::endl; + std::cout << "Full device name: " << deviceName << std::endl; + std::cout << std::endl; + std::cout.flags(fmt); +} + +static UNUSED void printPerformanceCountsSort(std::vector performanceData, + std::ostream& stream, + std::string deviceName, + std::string sorttype, + bool bshowHeader = true) { + if (sorttype == pcNoSort) { + printPerformanceCountsNoSort(performanceData, stream, deviceName, bshowHeader); + } else if (sorttype == pcSort) { + printPerformanceCountsDescendSort(performanceData, stream, deviceName, bshowHeader); + } else if (sorttype == pcSimpleSort) { + printPerformanceCountsSimpleSort(performanceData, stream, deviceName, bshowHeader); + } +} diff --git a/tools/benchmark_tool/openvino/tools/benchmark/main.py b/tools/benchmark_tool/openvino/tools/benchmark/main.py index 598b9cf9a0d..60fce01fa20 100644 --- a/tools/benchmark_tool/openvino/tools/benchmark/main.py +++ b/tools/benchmark_tool/openvino/tools/benchmark/main.py @@ -15,7 +15,7 @@ from openvino.tools.benchmark.utils.inputs_filling import get_input_data from openvino.tools.benchmark.utils.logging import logger from openvino.tools.benchmark.utils.progress_bar import ProgressBar from openvino.tools.benchmark.utils.utils import next_step, get_number_iterations, pre_post_processing, \ - process_help_inference_string, print_perf_counters, dump_exec_graph, get_duration_in_milliseconds, \ + process_help_inference_string, print_perf_counters, print_perf_counters_sort, dump_exec_graph, get_duration_in_milliseconds, \ get_command_line_arguments, parse_value_per_device, parse_devices, get_inputs_info, \ print_inputs_and_outputs_info, get_network_batch_size, load_config, dump_config, get_latency_groups, \ check_for_static, can_measure_as_static @@ -156,6 +156,9 @@ def main(): logger.warning(f"Turn on performance counters for {device} device " + "due to execution graph dumping.") config[device]['PERF_COUNT'] = 'YES' + elif is_flag_set_in_command_line('pcsort'): + ## set to default value + config[device]['PERF_COUNT'] = 'YES' if args.perf_counts_sort else 'NO' else: ## set to default value config[device]['PERF_COUNT'] = 'YES' if args.perf_counts else 'NO' @@ -500,9 +503,17 @@ def main(): perfs_count_list = [] for request in requests: perfs_count_list.append(request.profiling_info) - if args.perf_counts: + + if args.perf_counts_sort: + total_sorted_list = print_perf_counters_sort(perfs_count_list,sort_flag=args.perf_counts_sort) + if statistics: + statistics.dump_performance_counters_sorted(total_sorted_list) + + elif args.perf_counts: print_perf_counters(perfs_count_list) + if statistics: + # if not args.perf_counts_sort: statistics.dump_performance_counters(perfs_count_list) if statistics: diff --git a/tools/benchmark_tool/openvino/tools/benchmark/parameters.py b/tools/benchmark_tool/openvino/tools/benchmark/parameters.py index fffee81bf6c..4516e258c6d 100644 --- a/tools/benchmark_tool/openvino/tools/benchmark/parameters.py +++ b/tools/benchmark_tool/openvino/tools/benchmark/parameters.py @@ -114,6 +114,12 @@ def parse_args(): help='Optional. Path to a file where to store executable graph information serialized.') args.add_argument('-pc', '--perf_counts', type=str2bool, required=False, default=False, nargs='?', const=True, help='Optional. Report performance counters.', ) + args.add_argument('-pcsort', '--perf_counts_sort', type=str, required=False, default="", + choices=['no_sort', 'sort', 'simple_sort'], + help='Optional. Report performance counters and analysis the sort hotpoint opts.' + ' sort: Analysis opts time cost, print by hotpoint order' + ' no_sort: Analysis opts time cost, print by normal order' + ' simple_sort: Analysis opts time cost, only print EXECUTED opts by normal order', ) args.add_argument('-pcseq', '--pcseq', type=str2bool, required=False, default=False, nargs='?', const=True, help='Optional. Report latencies for each shape in -data_shape sequence.', ) args.add_argument('-inference_only', '--inference_only', type=str2bool, required=False, default=None, nargs='?', const=True, diff --git a/tools/benchmark_tool/openvino/tools/benchmark/utils/statistics_report.py b/tools/benchmark_tool/openvino/tools/benchmark/utils/statistics_report.py index dcac35f36f8..1909335266d 100644 --- a/tools/benchmark_tool/openvino/tools/benchmark/utils/statistics_report.py +++ b/tools/benchmark_tool/openvino/tools/benchmark/utils/statistics_report.py @@ -5,7 +5,7 @@ from datetime import timedelta import os import sys from enum import Enum - +import csv from .logging import logger ## statistics reports types @@ -75,6 +75,27 @@ class StatisticsReport: str((total_cpu // timedelta(microseconds=1))/1000.0)])) f.write('\n\n') + def dump_performance_counters_sorted(self,prof_sorted_info): + """Save sorted performance counters into csv file. + """ + filename = os.path.join(self.config.report_folder, f'benchmark_sorted_report.csv') + total = 0 + total_cpu = 0 + with open(filename, 'w') as f: + writer = csv.writer(f) + writer.writerow(['LayerName', 'ExecStatus', 'LayerType', 'ExecType', 'RealTime (ms)', 'CPUTime (ms)' , 'Proportion(%) \n']) + for tmp_prof in prof_sorted_info: + writer.writerow([tmp_prof[0], str(tmp_prof[1]), + tmp_prof[2], tmp_prof[6], + str(tmp_prof[3]), str(tmp_prof[4]), + str("%.2f"%(tmp_prof[5]*100))+"%"]) + total += tmp_prof[3] + total_cpu += tmp_prof[4] + f.write('\n') + writer.writerow(["Total time cost: RealTime %.2fms , CPUTime %.2fms"%(total,total_cpu)]) + f.write('\n\n') + logger.info(f'Sorted performance counters report is stored to {filename}') + def dump_performance_counters(self, prof_info_list): if self.config.report_type == '' or self.config.report_type == noCntReport: logger.info("Statistics collecting for performance counters was not requested. No reports are dumped.") diff --git a/tools/benchmark_tool/openvino/tools/benchmark/utils/utils.py b/tools/benchmark_tool/openvino/tools/benchmark/utils/utils.py index d0d409cbb89..7a97c035140 100644 --- a/tools/benchmark_tool/openvino/tools/benchmark/utils/utils.py +++ b/tools/benchmark_tool/openvino/tools/benchmark/utils/utils.py @@ -309,6 +309,68 @@ def process_help_inference_string(benchmark_app, device_number_streams): def dump_exec_graph(compiled_model, model_path): serialize(compiled_model.get_runtime_model(), model_path) +def print_perf_counters_sort(perf_counts_list,sort_flag="sort"): + """ Print opts time cost and can be sorted according by each opts time cost + """ + for ni in range(len(perf_counts_list)): + perf_counts = perf_counts_list[ni] + total_time = datetime.timedelta() + total_time_cpu = datetime.timedelta() + logger.info(f"Performance counts sorted for {ni}-th infer request") + for pi in perf_counts: + total_time += pi.real_time + total_time_cpu += pi.cpu_time + + total_time = total_time.microseconds + total_time_cpu = total_time_cpu.microseconds + total_real_time_proportion = 0 + total_detail_data=[] + for pi in perf_counts: + node_name = pi.node_name + layerStatus = pi.status + layerType = pi.node_type + real_time = pi.real_time.microseconds + cpu_time = pi.cpu_time.microseconds + real_proportion = round(real_time/total_time,4) + execType = pi.exec_type + tmp_data=[node_name,layerStatus,layerType,real_time,cpu_time,real_proportion,execType] + total_detail_data.append(tmp_data) + total_real_time_proportion += real_proportion + total_detail_data = np.array(total_detail_data) + if sort_flag=="sort": + total_detail_data = sorted(total_detail_data,key=lambda tmp_data:tmp_data[-4],reverse=True) + elif sort_flag=="no_sort": + total_detail_data = total_detail_data + elif sort_flag=="simple_sort": + total_detail_data = sorted(total_detail_data,key=lambda tmp_data:tmp_data[-4],reverse=True) + total_detail_data = [tmp_data for tmp_data in total_detail_data if str(tmp_data[1])!="Status.NOT_RUN"] + print_detail_result(total_detail_data) + print(f'Total time: {total_time} microseconds') + print(f'Total CPU time: {total_time_cpu} microseconds') + print(f'Total proportion: {"%.2f"%(round(total_real_time_proportion)*100)} % \n') + return total_detail_data + +def print_detail_result(result_list): + """ Print_perf_counters_sort result + """ + max_layer_name = 30 + for tmp_result in result_list: + node_name = tmp_result[0] + layerStatus = tmp_result[1] + layerType = tmp_result[2] + real_time = tmp_result[3] + cpu_time = tmp_result[4] + real_proportion = "%.2f"%(tmp_result[5]*100) + if real_proportion == "0.00": + real_proportion = "-nan" + execType = tmp_result[6] + print(f"{node_name[:max_layer_name - 4] + '...' if (len(node_name) >= max_layer_name) else node_name:<30}" + f"{str(layerStatus):<20}" + f"{'layerType: ' + layerType:<30}" + f"{'realTime: ' + str(real_time):<20}" + f"{'cpu: ' + str(cpu_time):<15}" + f"{'proportion: '+ str(real_proportion)+'%':<20}" + f"{'execType: ' + execType:<20}") def print_perf_counters(perf_counts_list): max_layer_name = 30