From 152511daa8ce5612c035191a831faec5c8029a57 Mon Sep 17 00:00:00 2001
From: 18582088138 <752038@qq.com>
Date: Wed, 19 Oct 2022 22:59:12 +0800
Subject: [PATCH] Benchmark sort opt (#12350)

* Benchmark_app python and c/c++ script add op profiling feature

* Fix compile bug about benchmark c/c++ op profiling

* Fix Compile Bug

* Fix issue in PR#12350 : 1. report_type info update; 2. priority of -pc and -pcsort; 3.align format between c++ and python; 4.check 'proportion' for 0 and print N/A; 5.detect the confusing print info

* Fix 8/17 review suggestion

* Fix 8/23 suggestion

* Fix the clang-format issue

* Fix the win-cc issue

* Fix win-cc issue about conversion from 'double' to 'float', possible loss of data

* Push PR by ODT group account

Co-authored-by: Chen Peter <peter.chen@intel.com>
Co-authored-by: Fiona Zhao <fiona.zhao@intel.com>
---
 samples/cpp/benchmark_app/benchmark_app.hpp   |  11 +
 samples/cpp/benchmark_app/main.cpp            |  26 ++-
 .../cpp/benchmark_app/statistics_report.cpp   |  94 ++++++++
 .../cpp/benchmark_app/statistics_report.hpp   |   9 +
 .../common/utils/include/samples/common.hpp   | 205 ++++++++++++++++++
 .../openvino/tools/benchmark/main.py          |  15 +-
 .../openvino/tools/benchmark/parameters.py    |   6 +
 .../benchmark/utils/statistics_report.py      |  23 +-
 .../openvino/tools/benchmark/utils/utils.py   |  62 ++++++
 9 files changed, 444 insertions(+), 7 deletions(-)
 mode change 100644 => 100755 samples/cpp/benchmark_app/main.cpp
diff --git a/samples/cpp/benchmark_app/benchmark_app.hpp b/samples/cpp/benchmark_app/benchmark_app.hpp
index d1d76fe3c8a..d02f04c9e02 100644
--- a/samples/cpp/benchmark_app/benchmark_app.hpp
+++ b/samples/cpp/benchmark_app/benchmark_app.hpp
@@ -157,6 +157,13 @@ static const char progress_message[] =
 // @brief message for performance counters option
 static const char pc_message[] = "Optional. Report performance counters.";
 
+// @brief message for sorted performance counters option
+static const char pc_sort_message[] =
+    "Optional. Report performance counters and analysis the sort hotpoint opts. "
+    " \"sort\" Analysis opts time cost, print by hotpoint order "
+    " \"no_sort\" Analysis opts time cost, print by normal order "
+    " \"simple_sort\" Analysis opts time cost, only print EXECUTED opts by normal order";
+
 // @brief message for performance counters for sequence option
 static const char pcseq_message[] = "Optional. Report latencies for each shape in -data_shape sequence.";
 
@@ -315,6 +322,9 @@ DEFINE_bool(progress, false, progress_message);
 /// @brief Define flag for showing performance counters <br>
 DEFINE_bool(pc, false, pc_message);
 
+/// @brief Define flag for showing sorted performance counters <br>
+DEFINE_string(pcsort, "", pc_sort_message);
+
 /// @brief Define flag for showing performance sequence counters <br>
 DEFINE_bool(pcseq, false, pcseq_message);
 
@@ -411,6 +421,7 @@ static void show_usage() {
     std::cout << "    -json_stats               " << json_stats_message << std::endl;
     std::cout << "    -exec_graph_path          " << exec_graph_path_message << std::endl;
     std::cout << "    -pc                       " << pc_message << std::endl;
+    std::cout << "    -pcsort                   " << pc_sort_message << std::endl;
     std::cout << "    -pcseq                    " << pcseq_message << std::endl;
     std::cout << "    -dump_config              " << dump_config_message << std::endl;
     std::cout << "    -load_config              " << load_config_message << std::endl;
diff --git a/samples/cpp/benchmark_app/main.cpp b/samples/cpp/benchmark_app/main.cpp
old mode 100644
new mode 100755
index 71b8a0fd91d..85e84c7cdc9
--- a/samples/cpp/benchmark_app/main.cpp
+++ b/samples/cpp/benchmark_app/main.cpp
@@ -66,9 +66,9 @@ bool parse_and_check_command_line(int argc, char* argv[]) {
                                "(those options can be used in OpenVINO together), but a benchmark_app UI rule.");
     }
     if (!FLAGS_report_type.empty() && FLAGS_report_type != noCntReport && FLAGS_report_type != averageCntReport &&
-        FLAGS_report_type != detailedCntReport) {
+        FLAGS_report_type != detailedCntReport && FLAGS_report_type != sortDetailedCntReport) {
         std::string err = "only " + std::string(noCntReport) + "/" + std::string(averageCntReport) + "/" +
-                          std::string(detailedCntReport) +
+                          std::string(detailedCntReport) + "/" + std::string(sortDetailedCntReport) +
                           " report types are supported (invalid -report_type option value)";
         throw std::logic_error(err);
     }
@@ -77,6 +77,12 @@ bool parse_and_check_command_line(int argc, char* argv[]) {
         throw std::logic_error("only " + std::string(detailedCntReport) + " report type is supported for MULTI device");
     }
 
+    if (!FLAGS_pcsort.empty() && FLAGS_pcsort != "sort" && FLAGS_pcsort != "no_sort" && FLAGS_pcsort != "simple_sort") {
+        std::string pcsort_err = std::string("Incorrect performance count sort . Please set -pcsort option to ") +
+                                 std::string("'sort', 'no_sort', 'simple_sort'.");
+        throw std::logic_error(pcsort_err);
+    }
+
     bool isNetworkCompiled = fileExt(FLAGS_m) == "blob";
     bool isPrecisionSet = !(FLAGS_ip.empty() && FLAGS_op.empty() && FLAGS_iop.empty());
     if (isNetworkCompiled && isPrecisionSet) {
@@ -304,7 +310,8 @@ int main(int argc, char* argv[]) {
                        (device_config.at(ov::enable_profiling.name()).as<bool>())) {
                 slog::warn << "Performance counters for " << device
                            << " device is turned on. To print results use -pc option." << slog::endl;
-            } else if (FLAGS_report_type == detailedCntReport || FLAGS_report_type == averageCntReport) {
+            } else if (FLAGS_report_type == detailedCntReport || FLAGS_report_type == averageCntReport ||
+                       FLAGS_report_type == sortDetailedCntReport) {
                 slog::warn << "Turn on performance counters for " << device << " device since report type is "
                            << FLAGS_report_type << "." << slog::endl;
                 device_config.emplace(ov::enable_profiling(true));
@@ -312,6 +319,10 @@ int main(int argc, char* argv[]) {
                 slog::warn << "Turn on performance counters for " << device << " device due to execution graph dumping."
                            << slog::endl;
                 device_config.emplace(ov::enable_profiling(true));
+            } else if (!FLAGS_pcsort.empty()) {
+                slog::warn << "Turn on sorted performance counters for " << device << " device since pcsort is"
+                           << FLAGS_pcsort << "." << slog::endl;
+                device_config.emplace(ov::enable_profiling(true));
             } else {
                 // set to default value
                 device_config.emplace(ov::enable_profiling(FLAGS_pc));
@@ -1171,7 +1182,14 @@ int main(int argc, char* argv[]) {
             std::vector<std::vector<ov::ProfilingInfo>> perfCounts;
             for (size_t ireq = 0; ireq < nireq; ireq++) {
                 auto reqPerfCounts = inferRequestsQueue.requests[ireq]->get_performance_counts();
-                if (FLAGS_pc) {
+                if (!FLAGS_pcsort.empty()) {
+                    slog::info << "Sort performance counts for " << ireq << "-th infer request:" << slog::endl;
+                    printPerformanceCountsSort(reqPerfCounts,
+                                               std::cout,
+                                               getFullDeviceName(core, FLAGS_d),
+                                               FLAGS_pcsort,
+                                               false);
+                } else if (FLAGS_pc) {
                     slog::info << "Performance counts for " << ireq << "-th infer request:" << slog::endl;
                     printPerformanceCounts(reqPerfCounts, std::cout, getFullDeviceName(core, FLAGS_d), false);
                 }
diff --git a/samples/cpp/benchmark_app/statistics_report.cpp b/samples/cpp/benchmark_app/statistics_report.cpp
index 89f8a74e747..c8c89ac07b7 100644
--- a/samples/cpp/benchmark_app/statistics_report.cpp
+++ b/samples/cpp/benchmark_app/statistics_report.cpp
@@ -93,6 +93,7 @@ void StatisticsReport::dump_performance_counters_request(CsvDumper& dumper, cons
         total_cpu += layer.cpu_time;
         dumper.endLine();
     }
+
     dumper << "Total"
            << ""
            << ""
@@ -102,6 +103,55 @@ void StatisticsReport::dump_performance_counters_request(CsvDumper& dumper, cons
     dumper.endLine();
 }
 
+void StatisticsReport::dump_sort_performance_counters_request(CsvDumper& dumper,
+                                                              const PerformanceCounters& perfCounts) {
+    std::chrono::microseconds total = std::chrono::microseconds::zero();
+    std::chrono::microseconds total_cpu = std::chrono::microseconds::zero();
+    int layersize = 0;
+
+    dumper << "layerName"
+           << "execStatus"
+           << "layerType"
+           << "execType";
+    dumper << "realTime (ms)"
+           << "cpuTime (ms)"
+           << " %";
+    dumper.endLine();
+
+    for (const auto& layer : perfCounts) {
+        if (status_names[(int)layer.status] == "EXECUTED") {
+            total += layer.real_time;
+            total_cpu += layer.cpu_time;
+        }
+    }
+
+    // sort perfcounter
+    std::vector<ov::ProfilingInfo> profiling{std::begin(perfCounts), std::end(perfCounts)};
+    std::sort(profiling.begin(), profiling.end(), sort_profiling_descend);
+    for (const auto& layer : profiling) {
+        if (status_names[(int)layer.status] == "EXECUTED") {
+            dumper << layer.node_name;  // layer name
+            dumper << ((int)layer.status < (sizeof(status_names) / sizeof(status_names[0]))
+                           ? status_names[(int)layer.status]
+                           : "INVALID_STATUS");
+            dumper << layer.node_type << layer.exec_type;
+            dumper << std::to_string(layer.real_time.count() / 1000.0)
+                   << std::to_string(layer.cpu_time.count() / 1000.0);
+            dumper << (layer.real_time * 1.0 / total) * 100;
+            dumper.endLine();
+            layersize += 1;
+        }
+    }
+
+    dumper << "Total"
+           << ""
+           << ""
+           << "";
+    dumper << total.count() / 1000.0 << total_cpu.count() / 1000.0 << 100.0;
+    dumper.endLine();
+    dumper.endLine();
+}
+
 StatisticsReport::PerformanceCounters StatisticsReport::get_average_performance_counters(
     const std::vector<PerformanceCounters>& perfCounts) {
     StatisticsReport::PerformanceCounters performanceCountersAvg;
@@ -148,6 +198,10 @@ void StatisticsReport::dump_performance_counters(const std::vector<PerformanceCo
         }
     } else if (_config.report_type == averageCntReport) {
         dump_performance_counters_request(dumper, get_average_performance_counters(perfCounts));
+    } else if (_config.report_type == sortDetailedCntReport) {
+        for (auto& pc : perfCounts) {
+            dump_sort_performance_counters_request(dumper, pc);
+        }
     } else {
         throw std::logic_error("PM data can only be collected for average or detailed report types");
     }
@@ -206,6 +260,10 @@ void StatisticsReportJSON::dump_performance_counters(const std::vector<Performan
     } else if (_config.report_type == averageCntReport) {
         js["report_type"] = "average";
         js["avg_performance"] = perf_counters_to_json(get_average_performance_counters(perfCounts));
+    } else if (_config.report_type == sortDetailedCntReport) {
+        for (auto& pc : perfCounts) {
+            js["detailed_performance"].push_back(sort_perf_counters_to_json(pc));
+        }
     } else {
         throw std::logic_error("PM data can only be collected for average or detailed report types");
     }
@@ -242,6 +300,42 @@ const nlohmann::json StatisticsReportJSON::perf_counters_to_json(
     return js;
 }
 
+const nlohmann::json StatisticsReportJSON::sort_perf_counters_to_json(
+    const StatisticsReport::PerformanceCounters& perfCounts) {
+    std::chrono::microseconds total = std::chrono::microseconds::zero();
+    std::chrono::microseconds total_cpu = std::chrono::microseconds::zero();
+
+    nlohmann::json js;
+    js["nodes"] = nlohmann::json::array();
+
+    for (const auto& layer : perfCounts) {
+        total += layer.real_time;
+        total_cpu += layer.cpu_time;
+    }
+
+    // sort perfcounter
+    std::vector<ov::ProfilingInfo> sortPerfCounts{std::begin(perfCounts), std::end(perfCounts)};
+    std::sort(sortPerfCounts.begin(), sortPerfCounts.end(), sort_profiling_descend);
+
+    for (const auto& layer : sortPerfCounts) {
+        nlohmann::json item;
+        item["name"] = layer.node_name;  // layer name
+        item["status"] =
+            ((int)layer.status < (sizeof(status_names) / sizeof(status_names[0])) ? status_names[(int)layer.status]
+                                                                                  : "INVALID_STATUS");
+        item["node_type"] = layer.node_type;
+        item["exec_type"] = layer.exec_type;
+        item["real_time"] = layer.real_time.count() / 1000.0;
+        item["cpu_time"] = layer.cpu_time.count() / 1000.0;
+        item["%"] = std::round(layer.real_time * 10000.0 / total) / 100;
+        js["nodes"].push_back(item);
+    }
+
+    js["total_real_time"] = total.count() / 1000.0;
+    js["total_cpu_time"] = total_cpu.count() / 1000.0;
+    return js;
+}
+
 void LatencyMetrics::write_to_stream(std::ostream& stream) const {
     std::ios::fmtflags fmt(std::cout.flags());
     stream << data_shape << ";" << std::fixed << std::setprecision(2) << median_or_percentile << ";" << avg << ";"
diff --git a/samples/cpp/benchmark_app/statistics_report.hpp b/samples/cpp/benchmark_app/statistics_report.hpp
index d54d46e58b2..cd0128fc377 100644
--- a/samples/cpp/benchmark_app/statistics_report.hpp
+++ b/samples/cpp/benchmark_app/statistics_report.hpp
@@ -22,6 +22,7 @@
 static constexpr char noCntReport[] = "no_counters";
 static constexpr char averageCntReport[] = "average_counters";
 static constexpr char detailedCntReport[] = "detailed_counters";
+static constexpr char sortDetailedCntReport[] = "sort_detailed_counters";
 
 /// @brief Responsible for calculating different latency metrics
 class LatencyMetrics {
@@ -143,6 +144,10 @@ public:
 
 private:
     void dump_performance_counters_request(CsvDumper& dumper, const PerformanceCounters& perfCounts);
+    void dump_sort_performance_counters_request(CsvDumper& dumper, const PerformanceCounters& perfCounts);
+    static bool sort_profiling_descend(const ov::ProfilingInfo& profiling1, const ov::ProfilingInfo& profiling2) {
+        return profiling1.real_time > profiling2.real_time;
+    }
 
 protected:
     // configuration of current benchmark execution
@@ -168,4 +173,8 @@ public:
 private:
     void dump_parameters(nlohmann::json& js, const StatisticsReport::Parameters& parameters);
     const nlohmann::json perf_counters_to_json(const StatisticsReport::PerformanceCounters& perfCounts);
+    const nlohmann::json sort_perf_counters_to_json(const StatisticsReport::PerformanceCounters& perfCounts);
+    static bool sort_profiling_descend(const ov::ProfilingInfo& profiling1, const ov::ProfilingInfo& profiling2) {
+        return profiling1.real_time > profiling2.real_time;
+    }
 };
diff --git a/samples/cpp/common/utils/include/samples/common.hpp b/samples/cpp/common/utils/include/samples/common.hpp
index 1dbfd43f917..fe4ff5b610b 100644
--- a/samples/cpp/common/utils/include/samples/common.hpp
+++ b/samples/cpp/common/utils/include/samples/common.hpp
@@ -23,11 +23,18 @@
 #include <utility>
 #include <vector>
 
+using std::setprecision;
+
 // clang-format off
 #include "openvino/openvino.hpp"
 #include "slog.hpp"
 // clang-format on
 
+// @brief performance counters sort
+static constexpr char pcSort[] = "sort";
+static constexpr char pcNoSort[] = "no_sort";
+static constexpr char pcSimpleSort[] = "simple_sort";
+
 #ifndef UNUSED
 #    if defined(_MSC_VER) && !defined(__clang__)
 #        define UNUSED
@@ -1048,3 +1055,201 @@ static UNUSED void printPerformanceCounts(ov::InferRequest request,
     auto performanceMap = request.get_profiling_info();
     printPerformanceCounts(performanceMap, stream, deviceName, bshowHeader);
 }
+
+static UNUSED void printPerformanceCountsNoSort(std::vector<ov::ProfilingInfo> performanceData,
+                                                std::ostream& stream,
+                                                std::string deviceName,
+                                                bool bshowHeader = true) {
+    std::chrono::microseconds totalTime = std::chrono::microseconds::zero();
+    // Print performance counts
+    if (bshowHeader) {
+        stream << std::endl << "performance counts:" << std::endl << std::endl;
+    }
+    std::ios::fmtflags fmt(std::cout.flags());
+
+    for (const auto& it : performanceData) {
+        if (it.real_time.count() > 0) {
+            totalTime += it.real_time;
+        }
+    }
+    if (totalTime.count() != 0) {
+        for (const auto& it : performanceData) {
+            std::string toPrint(it.node_name);
+            const int maxLayerName = 30;
+
+            if (it.node_name.length() >= maxLayerName) {
+                toPrint = it.node_name.substr(0, maxLayerName - 4);
+                toPrint += "...";
+            }
+
+            stream << std::setw(maxLayerName) << std::left << toPrint;
+            switch (it.status) {
+            case ov::ProfilingInfo::Status::EXECUTED:
+                stream << std::setw(15) << std::left << "EXECUTED";
+                break;
+            case ov::ProfilingInfo::Status::NOT_RUN:
+                stream << std::setw(15) << std::left << "NOT_RUN";
+                break;
+            case ov::ProfilingInfo::Status::OPTIMIZED_OUT:
+                stream << std::setw(15) << std::left << "OPTIMIZED_OUT";
+                break;
+            }
+            stream << std::setw(30) << std::left << "layerType: " + std::string(it.node_type) + " ";
+            stream << std::setw(20) << std::left << "realTime: " + std::to_string(it.real_time.count());
+            stream << std::setw(15) << std::left << "cpu: " + std::to_string(it.cpu_time.count());
+            double opt_proportion = it.real_time.count() * 100.0 / totalTime.count();
+            std::stringstream opt_proportion_ss;
+            opt_proportion_ss << std::fixed << std::setprecision(2) << opt_proportion;
+            std::string opt_proportion_str = opt_proportion_ss.str();
+            if (opt_proportion_str == "0.00") {
+                opt_proportion_str = "N/A";
+            }
+            stream << std::setw(20) << std::left << "proportion: " + opt_proportion_str + "%";
+            stream << " execType: " << it.exec_type << std::endl;
+        }
+    }
+    stream << std::setw(20) << std::left << "Total time: " + std::to_string(totalTime.count() / 1000.0)
+           << " microseconds" << std::endl;
+    std::cout << std::endl;
+    std::cout << "Full device name: " << deviceName << std::endl;
+    std::cout << std::endl;
+    std::cout.flags(fmt);
+}
+
+static UNUSED bool sort_pc_descend(const ov::ProfilingInfo& profiling1, const ov::ProfilingInfo& profiling2) {
+    return profiling1.real_time > profiling2.real_time;
+}
+
+static UNUSED void printPerformanceCountsDescendSort(std::vector<ov::ProfilingInfo> performanceData,
+                                                     std::ostream& stream,
+                                                     std::string deviceName,
+                                                     bool bshowHeader = true) {
+    std::chrono::microseconds totalTime = std::chrono::microseconds::zero();
+    // Print performance counts
+    if (bshowHeader) {
+        stream << std::endl << "performance counts:" << std::endl << std::endl;
+    }
+    std::ios::fmtflags fmt(std::cout.flags());
+
+    for (const auto& it : performanceData) {
+        if (it.real_time.count() > 0) {
+            totalTime += it.real_time;
+        }
+    }
+    if (totalTime.count() != 0) {
+        // sort perfcounter
+        std::vector<ov::ProfilingInfo> sortPerfCounts{std::begin(performanceData), std::end(performanceData)};
+        std::sort(sortPerfCounts.begin(), sortPerfCounts.end(), sort_pc_descend);
+
+        for (const auto& it : sortPerfCounts) {
+            std::string toPrint(it.node_name);
+            const int maxLayerName = 30;
+
+            if (it.node_name.length() >= maxLayerName) {
+                toPrint = it.node_name.substr(0, maxLayerName - 4);
+                toPrint += "...";
+            }
+
+            stream << std::setw(maxLayerName) << std::left << toPrint;
+            switch (it.status) {
+            case ov::ProfilingInfo::Status::EXECUTED:
+                stream << std::setw(15) << std::left << "EXECUTED";
+                break;
+            case ov::ProfilingInfo::Status::NOT_RUN:
+                stream << std::setw(15) << std::left << "NOT_RUN";
+                break;
+            case ov::ProfilingInfo::Status::OPTIMIZED_OUT:
+                stream << std::setw(15) << std::left << "OPTIMIZED_OUT";
+                break;
+            }
+            stream << std::setw(30) << std::left << "layerType: " + std::string(it.node_type) + " ";
+            stream << std::setw(20) << std::left << "realTime: " + std::to_string(it.real_time.count());
+            stream << std::setw(15) << std::left << "cpu: " + std::to_string(it.cpu_time.count());
+            double opt_proportion = it.real_time.count() * 100.0 / totalTime.count();
+            std::stringstream opt_proportion_ss;
+            opt_proportion_ss << std::fixed << std::setprecision(2) << opt_proportion;
+            std::string opt_proportion_str = opt_proportion_ss.str();
+            if (opt_proportion_str == "0.00") {
+                opt_proportion_str = "N/A";
+            }
+            stream << std::setw(20) << std::left << "proportion: " + opt_proportion_str + "%";
+            stream << " execType: " << it.exec_type << std::endl;
+        }
+    }
+    stream << std::setw(20) << std::left << "Total time: " + std::to_string(totalTime.count() / 1000.0)
+           << " microseconds" << std::endl;
+    std::cout << std::endl;
+    std::cout << "Full device name: " << deviceName << std::endl;
+    std::cout << std::endl;
+    std::cout.flags(fmt);
+}
+
+static UNUSED void printPerformanceCountsSimpleSort(std::vector<ov::ProfilingInfo> performanceData,
+                                                    std::ostream& stream,
+                                                    std::string deviceName,
+                                                    bool bshowHeader = true) {
+    std::chrono::microseconds totalTime = std::chrono::microseconds::zero();
+    // Print performance counts
+    if (bshowHeader) {
+        stream << std::endl << "performance counts:" << std::endl << std::endl;
+    }
+    std::ios::fmtflags fmt(std::cout.flags());
+
+    for (const auto& it : performanceData) {
+        if (it.real_time.count() > 0) {
+            totalTime += it.real_time;
+        }
+    }
+    if (totalTime.count() != 0) {
+        // sort perfcounter
+        std::vector<ov::ProfilingInfo> sortPerfCounts{std::begin(performanceData), std::end(performanceData)};
+        std::sort(sortPerfCounts.begin(), sortPerfCounts.end(), sort_pc_descend);
+
+        for (const auto& it : sortPerfCounts) {
+            if (it.status == ov::ProfilingInfo::Status::EXECUTED) {
+                std::string toPrint(it.node_name);
+                const int maxLayerName = 30;
+
+                if (it.node_name.length() >= maxLayerName) {
+                    toPrint = it.node_name.substr(0, maxLayerName - 4);
+                    toPrint += "...";
+                }
+
+                stream << std::setw(maxLayerName) << std::left << toPrint;
+                stream << std::setw(15) << std::left << "EXECUTED";
+                stream << std::setw(30) << std::left << "layerType: " + std::string(it.node_type) + " ";
+                stream << std::setw(20) << std::left << "realTime: " + std::to_string(it.real_time.count());
+                stream << std::setw(15) << std::left << "cpu: " + std::to_string(it.cpu_time.count());
+                double opt_proportion = it.real_time.count() * 100.0 / totalTime.count();
+                std::stringstream opt_proportion_ss;
+                opt_proportion_ss << std::fixed << std::setprecision(2) << opt_proportion;
+                std::string opt_proportion_str = opt_proportion_ss.str();
+                if (opt_proportion_str == "0.00") {
+                    opt_proportion_str = "N/A";
+                }
+                stream << std::setw(20) << std::left << "proportion: " + opt_proportion_str + "%";
+                stream << " execType: " << it.exec_type << std::endl;
+            }
+        }
+    }
+    stream << std::setw(20) << std::left << "Total time: " + std::to_string(totalTime.count() / 1000.0)
+           << " microseconds" << std::endl;
+    std::cout << std::endl;
+    std::cout << "Full device name: " << deviceName << std::endl;
+    std::cout << std::endl;
+    std::cout.flags(fmt);
+}
+
+static UNUSED void printPerformanceCountsSort(std::vector<ov::ProfilingInfo> performanceData,
+                                              std::ostream& stream,
+                                              std::string deviceName,
+                                              std::string sorttype,
+                                              bool bshowHeader = true) {
+    if (sorttype == pcNoSort) {
+        printPerformanceCountsNoSort(performanceData, stream, deviceName, bshowHeader);
+    } else if (sorttype == pcSort) {
+        printPerformanceCountsDescendSort(performanceData, stream, deviceName, bshowHeader);
+    } else if (sorttype == pcSimpleSort) {
+        printPerformanceCountsSimpleSort(performanceData, stream, deviceName, bshowHeader);
+    }
+}
diff --git a/tools/benchmark_tool/openvino/tools/benchmark/main.py b/tools/benchmark_tool/openvino/tools/benchmark/main.py
index 598b9cf9a0d..60fce01fa20 100644
--- a/tools/benchmark_tool/openvino/tools/benchmark/main.py
+++ b/tools/benchmark_tool/openvino/tools/benchmark/main.py
@@ -15,7 +15,7 @@ from openvino.tools.benchmark.utils.inputs_filling import get_input_data
 from openvino.tools.benchmark.utils.logging import logger
 from openvino.tools.benchmark.utils.progress_bar import ProgressBar
 from openvino.tools.benchmark.utils.utils import next_step, get_number_iterations, pre_post_processing, \
-    process_help_inference_string, print_perf_counters, dump_exec_graph, get_duration_in_milliseconds, \
+    process_help_inference_string, print_perf_counters, print_perf_counters_sort, dump_exec_graph, get_duration_in_milliseconds, \
     get_command_line_arguments, parse_value_per_device, parse_devices, get_inputs_info, \
     print_inputs_and_outputs_info, get_network_batch_size, load_config, dump_config, get_latency_groups, \
     check_for_static, can_measure_as_static
@@ -156,6 +156,9 @@ def main():
                 logger.warning(f"Turn on performance counters for {device} device " +
                                "due to execution graph dumping.")
                 config[device]['PERF_COUNT'] = 'YES'
+            elif is_flag_set_in_command_line('pcsort'):
+                ## set to default value
+                config[device]['PERF_COUNT'] = 'YES' if args.perf_counts_sort else 'NO'
             else:
                 ## set to default value
                 config[device]['PERF_COUNT'] = 'YES' if args.perf_counts else 'NO'
@@ -500,9 +503,17 @@ def main():
             perfs_count_list = []
             for request in requests:
                 perfs_count_list.append(request.profiling_info)
-            if args.perf_counts:
+
+            if args.perf_counts_sort:
+                total_sorted_list = print_perf_counters_sort(perfs_count_list,sort_flag=args.perf_counts_sort)
+                if statistics:
+                    statistics.dump_performance_counters_sorted(total_sorted_list)
+            
+            elif args.perf_counts:
                 print_perf_counters(perfs_count_list)
+
             if statistics:
+                # if not args.perf_counts_sort:
                 statistics.dump_performance_counters(perfs_count_list)
 
         if statistics:
diff --git a/tools/benchmark_tool/openvino/tools/benchmark/parameters.py b/tools/benchmark_tool/openvino/tools/benchmark/parameters.py
index fffee81bf6c..4516e258c6d 100644
--- a/tools/benchmark_tool/openvino/tools/benchmark/parameters.py
+++ b/tools/benchmark_tool/openvino/tools/benchmark/parameters.py
@@ -114,6 +114,12 @@ def parse_args():
                       help='Optional. Path to a file where to store executable graph information serialized.')
     args.add_argument('-pc', '--perf_counts', type=str2bool, required=False, default=False, nargs='?', const=True,
                       help='Optional. Report performance counters.', )
+    args.add_argument('-pcsort', '--perf_counts_sort', type=str, required=False, default="",
+                      choices=['no_sort', 'sort', 'simple_sort'],
+                      help='Optional. Report performance counters and analysis the sort hotpoint opts.'
+                           '  sort: Analysis opts time cost, print by hotpoint order'
+                           '  no_sort: Analysis opts time cost, print by normal order'
+                           '  simple_sort: Analysis opts time cost, only print EXECUTED opts by normal order', )
     args.add_argument('-pcseq', '--pcseq', type=str2bool, required=False, default=False, nargs='?', const=True,
                       help='Optional. Report latencies for each shape in -data_shape sequence.', )
     args.add_argument('-inference_only', '--inference_only', type=str2bool, required=False, default=None, nargs='?', const=True,
diff --git a/tools/benchmark_tool/openvino/tools/benchmark/utils/statistics_report.py b/tools/benchmark_tool/openvino/tools/benchmark/utils/statistics_report.py
index dcac35f36f8..1909335266d 100644
--- a/tools/benchmark_tool/openvino/tools/benchmark/utils/statistics_report.py
+++ b/tools/benchmark_tool/openvino/tools/benchmark/utils/statistics_report.py
@@ -5,7 +5,7 @@ from datetime import timedelta
 import os
 import sys
 from enum import Enum
-
+import csv
 from .logging import logger
 
 ## statistics reports types
@@ -75,6 +75,27 @@ class StatisticsReport:
             str((total_cpu // timedelta(microseconds=1))/1000.0)]))
         f.write('\n\n')
 
+    def dump_performance_counters_sorted(self,prof_sorted_info):
+        """Save sorted performance counters into csv file.
+        """
+        filename = os.path.join(self.config.report_folder, f'benchmark_sorted_report.csv')
+        total = 0
+        total_cpu = 0
+        with open(filename, 'w') as f:
+            writer = csv.writer(f)
+            writer.writerow(['LayerName', 'ExecStatus', 'LayerType', 'ExecType', 'RealTime (ms)', 'CPUTime (ms)' , 'Proportion(%) \n'])
+            for tmp_prof in prof_sorted_info:
+                writer.writerow([tmp_prof[0], str(tmp_prof[1]), 
+                                    tmp_prof[2], tmp_prof[6], 
+                                    str(tmp_prof[3]), str(tmp_prof[4]),
+                                    str("%.2f"%(tmp_prof[5]*100))+"%"])
+                total += tmp_prof[3]
+                total_cpu += tmp_prof[4]
+            f.write('\n') 
+            writer.writerow(["Total time cost: RealTime %.2fms , CPUTime %.2fms"%(total,total_cpu)])
+            f.write('\n\n')            
+        logger.info(f'Sorted performance counters report is stored to {filename}')
+
     def dump_performance_counters(self, prof_info_list):
         if self.config.report_type == '' or self.config.report_type == noCntReport:
             logger.info("Statistics collecting for performance counters was not requested. No reports are dumped.")
diff --git a/tools/benchmark_tool/openvino/tools/benchmark/utils/utils.py b/tools/benchmark_tool/openvino/tools/benchmark/utils/utils.py
index d0d409cbb89..7a97c035140 100644
--- a/tools/benchmark_tool/openvino/tools/benchmark/utils/utils.py
+++ b/tools/benchmark_tool/openvino/tools/benchmark/utils/utils.py
@@ -309,6 +309,68 @@ def process_help_inference_string(benchmark_app, device_number_streams):
 def dump_exec_graph(compiled_model, model_path):
     serialize(compiled_model.get_runtime_model(), model_path)
 
+def print_perf_counters_sort(perf_counts_list,sort_flag="sort"):
+    """ Print opts time cost and can be sorted according by each opts time cost
+    """
+    for ni in range(len(perf_counts_list)):
+        perf_counts = perf_counts_list[ni]
+        total_time = datetime.timedelta()
+        total_time_cpu = datetime.timedelta()
+        logger.info(f"Performance counts sorted for {ni}-th infer request")
+        for pi in perf_counts:
+            total_time += pi.real_time
+            total_time_cpu += pi.cpu_time
+
+        total_time = total_time.microseconds
+        total_time_cpu = total_time_cpu.microseconds
+        total_real_time_proportion = 0
+        total_detail_data=[]
+        for pi in perf_counts:
+            node_name = pi.node_name
+            layerStatus = pi.status
+            layerType = pi.node_type
+            real_time = pi.real_time.microseconds
+            cpu_time = pi.cpu_time.microseconds
+            real_proportion = round(real_time/total_time,4)
+            execType = pi.exec_type
+            tmp_data=[node_name,layerStatus,layerType,real_time,cpu_time,real_proportion,execType]
+            total_detail_data.append(tmp_data)
+            total_real_time_proportion += real_proportion
+        total_detail_data = np.array(total_detail_data)
+        if sort_flag=="sort":
+            total_detail_data = sorted(total_detail_data,key=lambda tmp_data:tmp_data[-4],reverse=True)
+        elif sort_flag=="no_sort":
+            total_detail_data = total_detail_data
+        elif sort_flag=="simple_sort":
+            total_detail_data = sorted(total_detail_data,key=lambda tmp_data:tmp_data[-4],reverse=True)
+            total_detail_data = [tmp_data for tmp_data in total_detail_data if str(tmp_data[1])!="Status.NOT_RUN"]
+        print_detail_result(total_detail_data)        
+        print(f'Total time:       {total_time} microseconds')
+        print(f'Total CPU time:   {total_time_cpu} microseconds')
+        print(f'Total proportion: {"%.2f"%(round(total_real_time_proportion)*100)} % \n')
+    return total_detail_data
+
+def print_detail_result(result_list):
+    """ Print_perf_counters_sort result 
+    """
+    max_layer_name = 30
+    for tmp_result in result_list:
+        node_name = tmp_result[0]
+        layerStatus = tmp_result[1]
+        layerType = tmp_result[2]
+        real_time = tmp_result[3]
+        cpu_time = tmp_result[4]
+        real_proportion = "%.2f"%(tmp_result[5]*100)
+        if real_proportion == "0.00":
+            real_proportion = "-nan"
+        execType = tmp_result[6]
+        print(f"{node_name[:max_layer_name - 4] + '...' if (len(node_name) >= max_layer_name) else node_name:<30}"
+            f"{str(layerStatus):<20}"
+            f"{'layerType: ' + layerType:<30}"
+            f"{'realTime: ' + str(real_time):<20}"
+            f"{'cpu: ' +  str(cpu_time):<15}"
+            f"{'proportion: '+ str(real_proportion)+'%':<20}"
+            f"{'execType: ' + execType:<20}")
 
 def print_perf_counters(perf_counts_list):
     max_layer_name = 30