Introduce -latency_percentile flag for the benchmark_app tool (#6479)
* Introduce new -latency_percentile flag for benchmark_app * Fix syntax
This commit is contained in:
@@ -95,6 +95,7 @@ Options:
|
||||
-layout Optional. Prompts how network layouts should be treated by application. For example, "input1[NCHW],input2[NC]" or "[NCHW]" in case of one input size.
|
||||
-cache_dir "<path>" Optional. Enables caching of loaded models to specified directory.
|
||||
-load_from_file Optional. Loads model from file directly without ReadNetwork.
|
||||
-latency_percentile Optional. Defines the percentile to be reported in latency metric. The valid range is [1, 100]. The default value is 50 (median).
|
||||
|
||||
CPU-specific performance options:
|
||||
-nstreams "<integer>" Optional. Number of streams to use for inference on the CPU, GPU or MYRIAD devices
|
||||
|
||||
@@ -56,6 +56,10 @@ static const char infer_num_streams_message[] = "Optional. Number of streams to
|
||||
"Also, using nstreams>1 is inherently throughput-oriented option, "
|
||||
"while for the best-latency estimations the number of streams should be set to 1.";
|
||||
|
||||
/// @brief message for latency percentile settings
|
||||
static const char infer_latency_percentile_message[] =
|
||||
"Optional. Defines the percentile to be reported in latency metric. The valid range is [1, 100]. The default value is 50 (median).";
|
||||
|
||||
/// @brief message for enforcing of BF16 execution where it is possible
|
||||
static const char enforce_bf16_message[] = "Optional. By default floating point operations execution in bfloat16 precision are enforced "
|
||||
"if supported by platform.\n"
|
||||
@@ -189,6 +193,9 @@ DEFINE_uint32(nthreads, 0, infer_num_threads_message);
|
||||
/// @brief Number of streams to use for inference on the CPU (also affects Hetero cases)
|
||||
DEFINE_string(nstreams, "", infer_num_streams_message);
|
||||
|
||||
/// @brief The percentile which will be reported in latency metric
|
||||
DEFINE_uint32(latency_percentile, 50, infer_latency_percentile_message);
|
||||
|
||||
/// @brief Enforces bf16 execution with bfloat16 precision on systems having this capability
|
||||
DEFINE_bool(enforcebf16, false, enforce_bf16_message);
|
||||
|
||||
@@ -278,6 +285,7 @@ static void showUsage() {
|
||||
std::cout << " -layout " << layout_message << std::endl;
|
||||
std::cout << " -cache_dir \"<path>\" " << cache_dir_message << std::endl;
|
||||
std::cout << " -load_from_file " << load_from_file_message << std::endl;
|
||||
std::cout << " -latency_percentile " << infer_latency_percentile_message << std::endl;
|
||||
std::cout << std::endl << " device-specific performance options:" << std::endl;
|
||||
std::cout << " -nstreams \"<integer>\" " << infer_num_streams_message << std::endl;
|
||||
std::cout << " -nthreads \"<integer>\" " << infer_num_threads_message << std::endl;
|
||||
|
||||
@@ -52,6 +52,10 @@ bool ParseAndCheckCommandLine(int argc, char* argv[]) {
|
||||
throw std::logic_error("Model is required but not set. Please set -m option.");
|
||||
}
|
||||
|
||||
if (FLAGS_latency_percentile > 100 || FLAGS_latency_percentile < 1) {
|
||||
showUsage();
|
||||
throw std::logic_error("The percentile value is incorrect. The applicable values range is [1, 100].");
|
||||
}
|
||||
if (FLAGS_api != "async" && FLAGS_api != "sync") {
|
||||
throw std::logic_error("Incorrect API. Please set -api option to `sync` or `async` value.");
|
||||
}
|
||||
@@ -100,11 +104,10 @@ static void next_step(const std::string additional_info = "") {
|
||||
}
|
||||
|
||||
/// @brief Returns the value at the given percentile of @p vec using the
///        nearest-rank method: index = ceil(size * percentile / 100) - 1.
/// @param vec        unsorted sample values (a sorted copy is made internally)
/// @param percentile requested percentile in [1, 100]; 50 approximates the median
/// @return the element of the sorted samples at the requested rank
/// @throws std::invalid_argument if @p vec is empty
template <typename T>
T getMedianValue(const std::vector<T>& vec, std::size_t percentile) {
    if (vec.empty()) {
        throw std::invalid_argument("getMedianValue: input vector is empty");
    }
    std::vector<T> sortedVec(vec);
    std::sort(sortedVec.begin(), sortedVec.end());
    // Nearest-rank index. Note: the naive `(size() / 100) * percentile` form is
    // wrong — integer division makes it 0 for any sample smaller than 100
    // elements, and it can index one past the end when percentile == 100.
    std::size_t rank = (sortedVec.size() * percentile + 99) / 100;  // == ceil(size * p / 100)
    return sortedVec[rank - 1];
}
|
||||
|
||||
/**
|
||||
@@ -624,7 +627,7 @@ int main(int argc, char* argv[]) {
|
||||
// wait the latest inference executions
|
||||
inferRequestsQueue.waitAll();
|
||||
|
||||
double latency = getMedianValue<double>(inferRequestsQueue.getLatencies());
|
||||
double latency = getMedianValue<double>(inferRequestsQueue.getLatencies(), FLAGS_latency_percentile);
|
||||
double totalDuration = inferRequestsQueue.getDurationInMilliseconds();
|
||||
double fps = (FLAGS_api == "sync") ? batchSize * 1000.0 / latency : batchSize * 1000.0 * iteration / totalDuration;
|
||||
|
||||
@@ -634,8 +637,14 @@ int main(int argc, char* argv[]) {
|
||||
{"total number of iterations", std::to_string(iteration)},
|
||||
});
|
||||
if (device_name.find("MULTI") == std::string::npos) {
|
||||
std::string latency_label;
|
||||
if (FLAGS_latency_percentile == 50) {
|
||||
latency_label = "latency (ms)";
|
||||
} else {
|
||||
latency_label = "latency (" + std::to_string(FLAGS_latency_percentile) + " percentile) (ms)";
|
||||
}
|
||||
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS, {
|
||||
{"latency (ms)", double_to_string(latency)},
|
||||
{latency_label, double_to_string(latency)},
|
||||
});
|
||||
}
|
||||
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS, {{"throughput", double_to_string(fps)}});
|
||||
@@ -684,8 +693,15 @@ int main(int argc, char* argv[]) {
|
||||
|
||||
std::cout << "Count: " << iteration << " iterations" << std::endl;
|
||||
std::cout << "Duration: " << double_to_string(totalDuration) << " ms" << std::endl;
|
||||
if (device_name.find("MULTI") == std::string::npos)
|
||||
std::cout << "Latency: " << double_to_string(latency) << " ms" << std::endl;
|
||||
if (device_name.find("MULTI") == std::string::npos) {
|
||||
std::cout << "Latency";
|
||||
if (FLAGS_latency_percentile == 50) {
|
||||
std::cout << ": ";
|
||||
} else {
|
||||
std::cout << " (" << FLAGS_latency_percentile << " percentile): ";
|
||||
}
|
||||
std::cout << double_to_string(latency) << " ms" << std::endl;
|
||||
}
|
||||
std::cout << "Throughput: " << double_to_string(fps) << " FPS" << std::endl;
|
||||
} catch (const std::exception& ex) {
|
||||
slog::err << ex.what() << slog::endl;
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
import os
|
||||
from datetime import datetime
|
||||
from statistics import median
|
||||
from math import ceil
|
||||
from openvino.inference_engine import IENetwork, IECore, get_version, StatusCode
|
||||
|
||||
from .utils.constants import MULTI_DEVICE_NAME, HETERO_DEVICE_NAME, CPU_DEVICE_NAME, GPU_DEVICE_NAME, XML_EXTENSION, BIN_EXTENSION
|
||||
@@ -11,6 +11,9 @@ from .utils.logging import logger
|
||||
from .utils.utils import get_duration_seconds
|
||||
from .utils.statistics_report import StatisticsReport
|
||||
|
||||
def percentile(values, percent):
    """Return the element of ``values`` at the given percentile.

    Uses the nearest-rank method (index = ceil(len * percent / 100) - 1).
    ``percent`` is expected to be in [1, 100]; ``values`` is expected to be
    sorted by the caller.
    """
    rank = ceil(len(values) * percent / 100)
    return values[rank - 1]
|
||||
|
||||
class Benchmark:
|
||||
def __init__(self, device: str, number_infer_requests: int = None, number_iterations: int = None,
|
||||
duration_seconds: int = None, api_type: str = 'async'):
|
||||
@@ -98,7 +101,7 @@ class Benchmark:
|
||||
raise Exception(f"Wait for all requests is failed with status code {status}!")
|
||||
return infer_request.latency
|
||||
|
||||
def infer(self, exe_network, batch_size, progress_bar=None):
|
||||
def infer(self, exe_network, batch_size, latency_percentile, progress_bar=None):
|
||||
progress_count = 0
|
||||
infer_requests = exe_network.requests
|
||||
|
||||
@@ -155,7 +158,7 @@ class Benchmark:
|
||||
for infer_request_id in in_fly:
|
||||
times.append(infer_requests[infer_request_id].latency)
|
||||
times.sort()
|
||||
latency_ms = median(times)
|
||||
latency_ms = percentile(times, latency_percentile)
|
||||
fps = batch_size * 1000 / latency_ms if self.api_type == 'sync' else batch_size * iteration / total_duration_sec
|
||||
if progress_bar:
|
||||
progress_bar.finish()
|
||||
|
||||
@@ -344,7 +344,7 @@ def run(args):
|
||||
[
|
||||
('first inference time (ms)', duration_ms)
|
||||
])
|
||||
fps, latency_ms, total_duration_sec, iteration = benchmark.infer(exe_network, batch_size, progress_bar)
|
||||
fps, latency_ms, total_duration_sec, iteration = benchmark.infer(exe_network, batch_size, args.latency_percentile, progress_bar)
|
||||
|
||||
# ------------------------------------ 11. Dumping statistics report -------------------------------------------
|
||||
next_step()
|
||||
@@ -372,9 +372,13 @@ def run(args):
|
||||
('total number of iterations', str(iteration)),
|
||||
])
|
||||
if MULTI_DEVICE_NAME not in device_name:
|
||||
if args.latency_percentile == 50:
|
||||
latency_prefix = 'latency (ms)'
|
||||
else:
|
||||
latency_prefix = 'latency (' + str(args.latency_percentile) + ' percentile) (ms)'
|
||||
statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
|
||||
[
|
||||
('latency (ms)', f'{latency_ms:.2f}'),
|
||||
(latency_prefix, f'{latency_ms:.2f}'),
|
||||
])
|
||||
|
||||
statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
|
||||
@@ -388,7 +392,10 @@ def run(args):
|
||||
print(f'Count: {iteration} iterations')
|
||||
print(f'Duration: {get_duration_in_milliseconds(total_duration_sec):.2f} ms')
|
||||
if MULTI_DEVICE_NAME not in device_name:
|
||||
print(f'Latency: {latency_ms:.2f} ms')
|
||||
if args.latency_percentile == 50:
|
||||
print(f'Latency: {latency_ms:.2f} ms')
|
||||
else:
|
||||
print(f'Latency ({args.latency_percentile} percentile): {latency_ms:.2f} ms')
|
||||
print(f'Throughput: {fps:.2f} FPS')
|
||||
|
||||
del exe_network
|
||||
|
||||
@@ -84,6 +84,8 @@ def parse_args():
|
||||
'Also, using nstreams>1 is inherently throughput-oriented option, while for the best-latency '
|
||||
'estimations the number of streams should be set to 1. '
|
||||
'See samples README for more details.')
|
||||
args.add_argument('--latency_percentile', type=int, required=False, default=50, choices=range(1,101),
|
||||
help='Optional. Defines the percentile to be reported in latency metric. The valid range is [1, 100]. The default value is 50 (median).')
|
||||
args.add_argument('-enforcebf16', '--enforce_bfloat16', type=str2bool, required=False, default=False, nargs='?', const=True, choices=[True, False],
|
||||
help='Optional. By default floating point operations execution in bfloat16 precision are enforced if supported by platform. '
|
||||
'\'true\' - enable bfloat16 regardless of platform support. '
|
||||
|
||||
Reference in New Issue
Block a user