Introduce -latency_percentile flag for the benchmark_app tool (#6479)
* Introduce new -latency_percentile flag for benchmark_app * Fix syntax
This commit is contained in:
@@ -95,6 +95,7 @@ Options:
|
||||
-layout Optional. Prompts how network layouts should be treated by application. For example, "input1[NCHW],input2[NC]" or "[NCHW]" in case of one input size.
|
||||
-cache_dir "<path>" Optional. Enables caching of loaded models to specified directory.
|
||||
-load_from_file Optional. Loads model from file directly without ReadNetwork.
|
||||
-latency_percentile Optional. Defines the percentile to be reported in latency metric. The valid range is [1, 100]. The default value is 50 (median).
|
||||
|
||||
CPU-specific performance options:
|
||||
-nstreams "<integer>" Optional. Number of streams to use for inference on the CPU, GPU or MYRIAD devices
|
||||
|
||||
@@ -56,6 +56,10 @@ static const char infer_num_streams_message[] = "Optional. Number of streams to
|
||||
"Also, using nstreams>1 is inherently throughput-oriented option, "
|
||||
"while for the best-latency estimations the number of streams should be set to 1.";
|
||||
|
||||
/// @brief message for latency percentile settings
|
||||
static const char infer_latency_percentile_message[] =
|
||||
"Optional. Defines the percentile to be reported in latency metric. The valid range is [1, 100]. The default value is 50 (median).";
|
||||
|
||||
/// @brief message for enforcing of BF16 execution where it is possible
|
||||
static const char enforce_bf16_message[] = "Optional. By default floating point operations execution in bfloat16 precision are enforced "
|
||||
"if supported by platform.\n"
|
||||
@@ -189,6 +193,9 @@ DEFINE_uint32(nthreads, 0, infer_num_threads_message);
|
||||
/// @brief Number of streams to use for inference on the CPU (also affects Hetero cases)
|
||||
DEFINE_string(nstreams, "", infer_num_streams_message);
|
||||
|
||||
/// @brief The percentile which will be reported in latency metric
|
||||
DEFINE_uint32(latency_percentile, 50, infer_latency_percentile_message);
|
||||
|
||||
/// @brief Enforces bf16 execution with bfloat16 precision on systems having this capability
|
||||
DEFINE_bool(enforcebf16, false, enforce_bf16_message);
|
||||
|
||||
@@ -278,6 +285,7 @@ static void showUsage() {
|
||||
std::cout << " -layout " << layout_message << std::endl;
|
||||
std::cout << " -cache_dir \"<path>\" " << cache_dir_message << std::endl;
|
||||
std::cout << " -load_from_file " << load_from_file_message << std::endl;
|
||||
std::cout << " -latency_percentile " << infer_latency_percentile_message << std::endl;
|
||||
std::cout << std::endl << " device-specific performance options:" << std::endl;
|
||||
std::cout << " -nstreams \"<integer>\" " << infer_num_streams_message << std::endl;
|
||||
std::cout << " -nthreads \"<integer>\" " << infer_num_threads_message << std::endl;
|
||||
|
||||
@@ -52,6 +52,10 @@ bool ParseAndCheckCommandLine(int argc, char* argv[]) {
|
||||
throw std::logic_error("Model is required but not set. Please set -m option.");
|
||||
}
|
||||
|
||||
if (FLAGS_latency_percentile > 100 || FLAGS_latency_percentile < 1) {
|
||||
showUsage();
|
||||
throw std::logic_error("The percentile value is incorrect. The applicable values range is [1, 100].");
|
||||
}
|
||||
if (FLAGS_api != "async" && FLAGS_api != "sync") {
|
||||
throw std::logic_error("Incorrect API. Please set -api option to `sync` or `async` value.");
|
||||
}
|
||||
@@ -100,11 +104,10 @@ static void next_step(const std::string additional_info = "") {
|
||||
}
|
||||
|
||||
/// @brief Returns the value at the given percentile of @p vec using the
///        nearest-rank method: index = ceil(size * percentile / 100) - 1.
/// @param vec        unsorted sample values (a sorted copy is made internally)
/// @param percentile requested percentile in [1, 100]; 50 approximates the median
/// @return the element of the sorted samples at the requested rank
/// @throws std::invalid_argument if @p vec is empty
template <typename T>
T getMedianValue(const std::vector<T>& vec, std::size_t percentile) {
    if (vec.empty()) {
        throw std::invalid_argument("getMedianValue: input vector is empty");
    }
    std::vector<T> sortedVec(vec);
    std::sort(sortedVec.begin(), sortedVec.end());
    // Nearest-rank index. Note: the naive `(size() / 100) * percentile` form is
    // wrong — integer division makes it 0 for any sample smaller than 100
    // elements, and it can index one past the end when percentile == 100.
    std::size_t rank = (sortedVec.size() * percentile + 99) / 100;  // == ceil(size * p / 100)
    return sortedVec[rank - 1];
}
|
||||
|
||||
/**
|
||||
@@ -624,7 +627,7 @@ int main(int argc, char* argv[]) {
|
||||
// wait the latest inference executions
|
||||
inferRequestsQueue.waitAll();
|
||||
|
||||
double latency = getMedianValue<double>(inferRequestsQueue.getLatencies());
|
||||
double latency = getMedianValue<double>(inferRequestsQueue.getLatencies(), FLAGS_latency_percentile);
|
||||
double totalDuration = inferRequestsQueue.getDurationInMilliseconds();
|
||||
double fps = (FLAGS_api == "sync") ? batchSize * 1000.0 / latency : batchSize * 1000.0 * iteration / totalDuration;
|
||||
|
||||
@@ -634,8 +637,14 @@ int main(int argc, char* argv[]) {
|
||||
{"total number of iterations", std::to_string(iteration)},
|
||||
});
|
||||
if (device_name.find("MULTI") == std::string::npos) {
|
||||
std::string latency_label;
|
||||
if (FLAGS_latency_percentile == 50) {
|
||||
latency_label = "latency (ms)";
|
||||
} else {
|
||||
latency_label = "latency (" + std::to_string(FLAGS_latency_percentile) + " percentile) (ms)";
|
||||
}
|
||||
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS, {
|
||||
{"latency (ms)", double_to_string(latency)},
|
||||
{latency_label, double_to_string(latency)},
|
||||
});
|
||||
}
|
||||
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS, {{"throughput", double_to_string(fps)}});
|
||||
@@ -684,8 +693,15 @@ int main(int argc, char* argv[]) {
|
||||
|
||||
std::cout << "Count: " << iteration << " iterations" << std::endl;
|
||||
std::cout << "Duration: " << double_to_string(totalDuration) << " ms" << std::endl;
|
||||
if (device_name.find("MULTI") == std::string::npos)
|
||||
std::cout << "Latency: " << double_to_string(latency) << " ms" << std::endl;
|
||||
if (device_name.find("MULTI") == std::string::npos) {
|
||||
std::cout << "Latency";
|
||||
if (FLAGS_latency_percentile == 50) {
|
||||
std::cout << ": ";
|
||||
} else {
|
||||
std::cout << " (" << FLAGS_latency_percentile << " percentile): ";
|
||||
}
|
||||
std::cout << double_to_string(latency) << " ms" << std::endl;
|
||||
}
|
||||
std::cout << "Throughput: " << double_to_string(fps) << " FPS" << std::endl;
|
||||
} catch (const std::exception& ex) {
|
||||
slog::err << ex.what() << slog::endl;
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
import os
|
||||
from datetime import datetime
|
||||
from statistics import median
|
||||
from math import ceil
|
||||
from openvino.inference_engine import IENetwork, IECore, get_version, StatusCode
|
||||
|
||||
from .utils.constants import MULTI_DEVICE_NAME, HETERO_DEVICE_NAME, CPU_DEVICE_NAME, GPU_DEVICE_NAME, XML_EXTENSION, BIN_EXTENSION
|
||||
@@ -11,6 +11,9 @@ from .utils.logging import logger
|
||||
from .utils.utils import get_duration_seconds
|
||||
from .utils.statistics_report import StatisticsReport
|
||||
|
||||
def percentile(values, percent):
    """Return the element of ``values`` at the given percentile.

    Uses the nearest-rank method (index = ceil(len * percent / 100) - 1).
    ``percent`` is expected to be in [1, 100]; ``values`` is expected to be
    sorted by the caller.
    """
    rank = ceil(len(values) * percent / 100)
    return values[rank - 1]
|
||||
|
||||
class Benchmark:
|
||||
def __init__(self, device: str, number_infer_requests: int = None, number_iterations: int = None,
|
||||
duration_seconds: int = None, api_type: str = 'async'):
|
||||
@@ -98,7 +101,7 @@ class Benchmark:
|
||||
raise Exception(f"Wait for all requests is failed with status code {status}!")
|
||||
return infer_request.latency
|
||||
|
||||
def infer(self, exe_network, batch_size, progress_bar=None):
|
||||
def infer(self, exe_network, batch_size, latency_percentile, progress_bar=None):
|
||||
progress_count = 0
|
||||
infer_requests = exe_network.requests
|
||||
|
||||
@@ -155,7 +158,7 @@ class Benchmark:
|
||||
for infer_request_id in in_fly:
|
||||
times.append(infer_requests[infer_request_id].latency)
|
||||
times.sort()
|
||||
latency_ms = median(times)
|
||||
latency_ms = percentile(times, latency_percentile)
|
||||
fps = batch_size * 1000 / latency_ms if self.api_type == 'sync' else batch_size * iteration / total_duration_sec
|
||||
if progress_bar:
|
||||
progress_bar.finish()
|
||||
|
||||
@@ -344,7 +344,7 @@ def run(args):
|
||||
[
|
||||
('first inference time (ms)', duration_ms)
|
||||
])
|
||||
fps, latency_ms, total_duration_sec, iteration = benchmark.infer(exe_network, batch_size, progress_bar)
|
||||
fps, latency_ms, total_duration_sec, iteration = benchmark.infer(exe_network, batch_size, args.latency_percentile, progress_bar)
|
||||
|
||||
# ------------------------------------ 11. Dumping statistics report -------------------------------------------
|
||||
next_step()
|
||||
@@ -372,9 +372,13 @@ def run(args):
|
||||
('total number of iterations', str(iteration)),
|
||||
])
|
||||
if MULTI_DEVICE_NAME not in device_name:
|
||||
if args.latency_percentile == 50:
|
||||
latency_prefix = 'latency (ms)'
|
||||
else:
|
||||
latency_prefix = 'latency (' + str(args.latency_percentile) + ' percentile) (ms)'
|
||||
statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
|
||||
[
|
||||
('latency (ms)', f'{latency_ms:.2f}'),
|
||||
(latency_prefix, f'{latency_ms:.2f}'),
|
||||
])
|
||||
|
||||
statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
|
||||
@@ -388,7 +392,10 @@ def run(args):
|
||||
print(f'Count: {iteration} iterations')
|
||||
print(f'Duration: {get_duration_in_milliseconds(total_duration_sec):.2f} ms')
|
||||
if MULTI_DEVICE_NAME not in device_name:
|
||||
print(f'Latency: {latency_ms:.2f} ms')
|
||||
if args.latency_percentile == 50:
|
||||
print(f'Latency: {latency_ms:.2f} ms')
|
||||
else:
|
||||
print(f'Latency ({args.latency_percentile} percentile): {latency_ms:.2f} ms')
|
||||
print(f'Throughput: {fps:.2f} FPS')
|
||||
|
||||
del exe_network
|
||||
|
||||
@@ -84,6 +84,8 @@ def parse_args():
|
||||
'Also, using nstreams>1 is inherently throughput-oriented option, while for the best-latency '
|
||||
'estimations the number of streams should be set to 1. '
|
||||
'See samples README for more details.')
|
||||
args.add_argument('--latency_percentile', type=int, required=False, default=50, choices=range(1,101),
|
||||
help='Optional. Defines the percentile to be reported in latency metric. The valid range is [1, 100]. The default value is 50 (median).')
|
||||
args.add_argument('-enforcebf16', '--enforce_bfloat16', type=str2bool, required=False, default=False, nargs='?', const=True, choices=[True, False],
|
||||
help='Optional. By default floating point operations execution in bfloat16 precision are enforced if supported by platform. '
|
||||
'\'true\' - enable bfloat16 regardless of platform support. '
|
||||
|
||||
Reference in New Issue
Block a user