Files
openvino/tools/benchmark/main.py
Mikhail Nosov d20900e235 [Caching] Add caching options to benchmark app (#4909)
* Python API for LoadNetwork by model file name

* BenchmarkApp: Add caching and LoadNetworkFromFile support

    2 new options are introduced
    - cache_dir <dir> - enables models caching
    - load_from_file - performs the new "LoadNetwork" call directly by model file name

    Using both parameters will achieve maximum performance of read/load network on startup

    Tests:
    1) Run "benchmark_app -h". Help will display 2 new options. After the available devices there will be a list of devices with cache support
    2) ./benchmark_app -d CPU -i <model.xml> -load_from_file
    Verify that some test steps are skipped (related to ReadNetwork, re-shaping etc)
    3) Pre-requisite: support of caching shall be enabled for Template plugin
    ./benchmark_app -d TEMPLATE -i <model.onnx> -load_from_file -cache_dir someDir
    Verify that "someDir" is created and generated blob is available
    Run again, verify that loading works as well (should be faster as it will not load onnx model)
    4) Run same test as (3), but without -load_from_file option. Verify that cache is properly created
    For some devices loadNetwork time shall be improved when cache is available

* Removed additional timing prints

* Correction from old code

* Revert "Removed additional timing prints"

Additional change - when .blob is chosen instead of .xml, it takes priority over caching flags

* Removed new time printings

As discussed, these time measurements like 'total first inference time' will be available in 'timeTests' scripts

* Fix clang-format issues
2021-05-17 13:41:15 +03:00

407 lines
21 KiB
Python

# Copyright (C) 2018-2021 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
import sys
import time
from datetime import datetime

from openvino.tools.benchmark.benchmark import Benchmark
from openvino.tools.benchmark.parameters import parse_args
from openvino.tools.benchmark.utils.constants import MULTI_DEVICE_NAME, HETERO_DEVICE_NAME, CPU_DEVICE_NAME, \
    GPU_DEVICE_NAME, MYRIAD_DEVICE_NAME, GNA_DEVICE_NAME, BLOB_EXTENSION
from openvino.tools.benchmark.utils.inputs_filling import set_inputs
from openvino.tools.benchmark.utils.logging import logger
from openvino.tools.benchmark.utils.progress_bar import ProgressBar
from openvino.tools.benchmark.utils.utils import next_step, get_number_iterations, process_precision, \
    process_help_inference_string, print_perf_counters, dump_exec_graph, get_duration_in_milliseconds, \
    get_command_line_arguments, parse_nstreams_value_per_device, parse_devices, get_inputs_info, \
    print_inputs_and_outputs_info, get_batch_size, load_config, dump_config
from openvino.tools.benchmark.utils.statistics_report import StatisticsReport, averageCntReport, detailedCntReport
def main():
    """Command-line entry point: announce step 1, parse the arguments and run the benchmark."""
    # ------------------------------ 1. Parsing and validating input arguments -------------------------------------
    next_step()
    parsed_args = parse_args()
    run(parsed_args)
def _elapsed_ms(start_time):
    """Return the time elapsed since `start_time` as a millisecond string.

    `start_time` must be a value previously returned by `time.perf_counter()`.
    A monotonic clock is used so that system clock adjustments cannot distort
    (or make negative) the measured duration. The result has two decimals.
    """
    return f"{(time.perf_counter() - start_time) * 1000:.2f}"


def _record_duration(statistics, action, duration_ms):
    """Log how long `action` took and add it to the statistics report, if any.

    `action` is the capitalised step name (e.g. "Load network"); its lower-case
    form is used as the report key ("load network time (ms)"). `duration_ms` is
    the already formatted duration string. `statistics` may be None/falsy, in
    which case the duration is only logged.
    """
    logger.info(f"{action} took {duration_ms} ms")
    if statistics:
        statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
                                  [
                                      (f'{action.lower()} time (ms)', duration_ms)
                                  ])


def run(args):
    """Execute the benchmark pipeline (steps 2-11) for already parsed arguments.

    The function builds the per-device configuration, obtains an executable
    network (loaded straight from the model file, read and loaded, or imported
    from a compiled blob), fills the inputs, measures performance and prints /
    dumps the results.

    :param args: parsed command-line arguments, as produced by `parse_args()`.

    Any exception raised along the way is logged, recorded in the statistics
    report when one was requested, and the process exits with status 1.
    """
    statistics = None
    try:
        if args.number_streams is None:
            logger.warning(" -nstreams default value is determined automatically for a device. "
                           "Although the automatic selection usually provides a reasonable performance, "
                           "but it still may be non-optimal for some cases, for more information look at README. ")

        command_line_arguments = get_command_line_arguments(sys.argv)
        if args.report_type:
            statistics = StatisticsReport(StatisticsReport.Config(args.report_type, args.report_folder))
            statistics.add_parameters(StatisticsReport.Category.COMMAND_LINE_PARAMETERS, command_line_arguments)

        def is_flag_set_in_command_line(flag):
            # True when `flag` (given without leading dashes) was passed explicitly on the command line.
            return any(name.strip('-') == flag for name, _ in command_line_arguments)

        device_name = args.target_device
        devices = parse_devices(device_name)
        device_number_streams = parse_nstreams_value_per_device(devices, args.number_streams)

        config = {}
        if args.load_config:
            load_config(args.load_config, config)

        # A model file with the blob extension is treated as an already compiled network.
        _, ext = os.path.splitext(args.path_to_model)
        is_network_compiled = ext == BLOB_EXTENSION
        if is_network_compiled:
            print("Network is compiled")

        # ------------------------------ 2. Loading Inference Engine ---------------------------------------------------
        next_step(step_id=2)

        benchmark = Benchmark(args.target_device, args.number_infer_requests,
                              args.number_iterations, args.time, args.api_type)

        ## CPU (MKLDNN) extensions
        if CPU_DEVICE_NAME in device_name and args.path_to_extension:
            benchmark.add_extension(path_to_extension=args.path_to_extension)

        ## GPU (clDNN) Extensions
        if GPU_DEVICE_NAME in device_name and args.path_to_cldnn_config:
            config.setdefault(GPU_DEVICE_NAME, {})['CONFIG_FILE'] = args.path_to_cldnn_config

        if GPU_DEVICE_NAME in config and 'CONFIG_FILE' in config[GPU_DEVICE_NAME]:
            cldnn_config = config[GPU_DEVICE_NAME]['CONFIG_FILE']
            benchmark.add_extension(path_to_cldnn_config=cldnn_config)

        version = benchmark.get_version_info()
        logger.info(version)

        # --------------------- 3. Setting device configuration --------------------------------------------------------
        next_step()

        def set_throughput_streams(device):
            # Decide the "<DEVICE>_THROUGHPUT_STREAMS" value for `device` and mirror it into device_number_streams.
            key = device + "_THROUGHPUT_STREAMS"
            if device in device_number_streams:
                ## set to user defined value
                supported_config_keys = benchmark.ie.get_metric(device, 'SUPPORTED_CONFIG_KEYS')
                if key not in supported_config_keys:
                    raise Exception(f"Device {device} doesn't support config key '{key}'! " +
                                    "Please specify -nstreams for correct devices in format <dev1>:<nstreams1>,<dev2>:<nstreams2>")
                config[device][key] = device_number_streams[device]
            elif key not in config[device] and args.api_type == "async":
                logger.warning(f"-nstreams default value is determined automatically for {device} device. " +
                               "Although the automatic selection usually provides a reasonable performance,"
                               "but it still may be non-optimal for some cases, for more information look at README.")
                if device != MYRIAD_DEVICE_NAME:  ## MYRIAD sets the default number of streams implicitly
                    config[device][key] = device + "_THROUGHPUT_AUTO"
            if key in config[device]:
                device_number_streams[device] = config[device][key]

        # Becomes True as soon as performance counters are enabled for at least one device.
        perf_counts = False
        for device in devices:
            device_config = config.setdefault(device, {})

            ## Set performance counter
            if is_flag_set_in_command_line('pc'):
                ## set to user defined value
                device_config['PERF_COUNT'] = 'YES' if args.perf_counts else 'NO'
            elif device_config.get('PERF_COUNT') == 'YES':
                logger.warning(f"Performance counters for {device} device is turned on. " +
                               "To print results use -pc option.")
            elif args.report_type in [averageCntReport, detailedCntReport]:
                logger.warning(f"Turn on performance counters for {device} device " +
                               f"since report type is {args.report_type}.")
                device_config['PERF_COUNT'] = 'YES'
            elif args.exec_graph_path is not None:
                logger.warning(f"Turn on performance counters for {device} device " +
                               "due to execution graph dumping.")
                device_config['PERF_COUNT'] = 'YES'
            else:
                ## set to default value
                device_config['PERF_COUNT'] = 'YES' if args.perf_counts else 'NO'
            if device_config['PERF_COUNT'] == 'YES':
                perf_counts = True

            if device == CPU_DEVICE_NAME:  # CPU supports few special performance-oriented keys
                # limit threading for CPU portion of inference
                if args.number_threads and is_flag_set_in_command_line("nthreads"):
                    device_config['CPU_THREADS_NUM'] = str(args.number_threads)

                if is_flag_set_in_command_line("enforcebf16") or is_flag_set_in_command_line("enforce_bfloat16"):
                    device_config['ENFORCE_BF16'] = 'YES' if args.enforce_bfloat16 else 'NO'

                if is_flag_set_in_command_line('pin'):
                    ## set to user defined value
                    device_config['CPU_BIND_THREAD'] = args.infer_threads_pinning
                elif 'CPU_BIND_THREAD' not in device_config:
                    if MULTI_DEVICE_NAME in device_name and GPU_DEVICE_NAME in device_name:
                        logger.warning(f"Turn off threads pinning for {device} " +
                                       "device since multi-scenario with GPU device is used.")
                        device_config['CPU_BIND_THREAD'] = 'NO'

                ## for CPU execution, more throughput-oriented execution via streams
                set_throughput_streams(device)
            elif device == GPU_DEVICE_NAME:
                ## for GPU execution, more throughput-oriented execution via streams
                set_throughput_streams(device)

                if MULTI_DEVICE_NAME in device_name and CPU_DEVICE_NAME in device_name:
                    logger.warning("Turn on GPU trottling. Multi-device execution with the CPU + GPU performs best with GPU trottling hint, " +
                                   "which releases another CPU thread (that is otherwise used by the GPU driver for active polling)")
                    device_config['CLDNN_PLUGIN_THROTTLE'] = '1'
            elif device == MYRIAD_DEVICE_NAME:
                set_throughput_streams(device)
                device_config['LOG_LEVEL'] = 'LOG_INFO'
            elif device == GNA_DEVICE_NAME:
                if is_flag_set_in_command_line('qb'):
                    device_config['GNA_PRECISION'] = 'I8' if args.qb == 8 else 'I16'
                if args.number_threads and is_flag_set_in_command_line("nthreads"):
                    device_config['GNA_LIB_N_THREADS'] = str(args.number_threads)
            else:
                # Unknown device: forward CPU-style options only when the device reports support for them.
                supported_config_keys = benchmark.ie.get_metric(device, 'SUPPORTED_CONFIG_KEYS')
                if 'CPU_THREADS_NUM' in supported_config_keys and args.number_threads and is_flag_set_in_command_line("nthreads"):
                    device_config['CPU_THREADS_NUM'] = str(args.number_threads)
                # NOTE(review): the messages in this function name the option "-nstreams", but this check
                # looks for a flag called "streams" - confirm the intended flag name against the parser.
                if 'CPU_THROUGHPUT_STREAMS' in supported_config_keys and args.number_streams and is_flag_set_in_command_line("streams"):
                    device_config['CPU_THROUGHPUT_STREAMS'] = args.number_streams
                if 'CPU_BIND_THREAD' in supported_config_keys and args.infer_threads_pinning and is_flag_set_in_command_line("pin"):
                    device_config['CPU_BIND_THREAD'] = args.infer_threads_pinning

        benchmark.set_config(config)
        batch_size = args.batch_size
        if args.cache_dir:
            benchmark.set_cache_dir(args.cache_dir)

        topology_name = ""
        load_from_file_enabled = is_flag_set_in_command_line('load_from_file') or is_flag_set_in_command_line('lfile')
        if load_from_file_enabled and not is_network_compiled:
            # Steps 4-6 are not performed when the network is loaded straight from the model file.
            for _ in range(3):
                next_step()
                print("Skipping the step for loading network from file")

            # --------------------- 7. Loading the model to the device -------------------------------------------------
            next_step()

            start_time = time.perf_counter()
            exe_network = benchmark.load_network(args.path_to_model)
            _record_duration(statistics, "Load network", _elapsed_ms(start_time))

            app_inputs_info, _ = get_inputs_info(args.shape, args.layout, args.batch_size, exe_network.input_info)
            if batch_size == 0:
                batch_size = 1
        elif not is_network_compiled:
            # --------------------- 4. Read the Intermediate Representation of the network -----------------------------
            next_step()

            start_time = time.perf_counter()
            ie_network = benchmark.read_network(args.path_to_model)
            topology_name = ie_network.name
            _record_duration(statistics, "Read network", _elapsed_ms(start_time))

            # --------------------- 5. Resizing network to match image sizes and given batch ---------------------------
            next_step()

            app_inputs_info, reshape = get_inputs_info(args.shape, args.layout, args.batch_size, ie_network.input_info)
            if reshape:
                start_time = time.perf_counter()
                shapes = {name: info.shape for name, info in app_inputs_info.items()}
                logger.info(
                    'Reshaping network: {}'.format(', '.join("'{}': {}".format(k, v) for k, v in shapes.items())))
                ie_network.reshape(shapes)
                _record_duration(statistics, "Reshape network", _elapsed_ms(start_time))

            # use batch size according to provided layout and shapes
            batch_size = get_batch_size(app_inputs_info) if args.layout else ie_network.batch_size

            logger.info(f'Network batch size: {batch_size}')

            # --------------------- 6. Configuring inputs and outputs of the model --------------------------------------------------
            next_step()

            process_precision(ie_network, app_inputs_info, args.input_precision, args.output_precision, args.input_output_precision)
            print_inputs_and_outputs_info(ie_network)

            # --------------------- 7. Loading the model to the device -------------------------------------------------
            next_step()

            start_time = time.perf_counter()
            exe_network = benchmark.load_network(ie_network)
            _record_duration(statistics, "Load network", _elapsed_ms(start_time))
        else:
            # Steps 4-6 are not performed for an already compiled network.
            for _ in range(3):
                next_step()
                print("Skipping the step for compiled network")

            # --------------------- 7. Loading the model to the device -------------------------------------------------
            next_step()

            start_time = time.perf_counter()
            exe_network = benchmark.import_network(args.path_to_model)
            _record_duration(statistics, "Import network", _elapsed_ms(start_time))

            app_inputs_info, _ = get_inputs_info(args.shape, args.layout, args.batch_size, exe_network.input_info)
            if batch_size == 0:
                batch_size = 1

        # --------------------- 8. Setting optimal runtime parameters --------------------------------------------------
        next_step()

        # Update number of streams
        for device in device_number_streams:
            key = device + '_THROUGHPUT_STREAMS'
            device_number_streams[device] = benchmark.ie.get_config(device, key)

        # Number of requests
        infer_requests = exe_network.requests

        # Iteration limit
        benchmark.niter = get_number_iterations(benchmark.niter, benchmark.nireq, args.api_type)

        # ------------------------------------ 9. Creating infer requests and filling input blobs ----------------------
        next_step()

        # Each entry of args.paths_to_input is unpacked into os.path.abspath; no inputs given -> empty list.
        if args.paths_to_input:
            paths_to_input = [os.path.abspath(*path) for path in args.paths_to_input]
        else:
            paths_to_input = []
        set_inputs(paths_to_input, batch_size, app_inputs_info, infer_requests)

        if statistics:
            statistics.add_parameters(StatisticsReport.Category.RUNTIME_CONFIG,
                                      [
                                          ('topology', topology_name),
                                          ('target device', device_name),
                                          ('API', args.api_type),
                                          ('precision', "UNSPECIFIED"),
                                          ('batch size', str(batch_size)),
                                          ('number of iterations', str(benchmark.niter) if benchmark.niter else "0"),
                                          ('number of parallel infer requests', str(benchmark.nireq)),
                                          ('duration (ms)', str(get_duration_in_milliseconds(benchmark.duration_seconds))),
                                      ])

            for streams_device, nstreams in device_number_streams.items():
                statistics.add_parameters(StatisticsReport.Category.RUNTIME_CONFIG,
                                          [
                                              (f"number of {streams_device} streams", str(nstreams)),
                                          ])

        # ------------------------------------ 10. Measuring performance -----------------------------------------------
        output_string = process_help_inference_string(benchmark)
        next_step(additional_info=output_string)

        progress_bar_total_count = 10000
        if benchmark.niter and not benchmark.duration_seconds:
            progress_bar_total_count = benchmark.niter

        progress_bar = ProgressBar(progress_bar_total_count, args.stream_output, args.progress) if args.progress else None

        _record_duration(statistics, "First inference", f"{benchmark.first_infer(exe_network):.2f}")

        fps, latency_ms, total_duration_sec, iteration = benchmark.infer(exe_network, batch_size, progress_bar)

        # ------------------------------------ 11. Dumping statistics report -------------------------------------------
        next_step()

        if args.dump_config:
            dump_config(args.dump_config, config)
            logger.info(f"Inference Engine configuration settings were dumped to {args.dump_config}")

        if args.exec_graph_path:
            dump_exec_graph(exe_network, args.exec_graph_path)

        if perf_counts:
            perfs_count_list = [exe_network.requests[ni].get_perf_counts()
                                for ni in range(int(benchmark.nireq))]
            if args.perf_counts:
                print_perf_counters(perfs_count_list)
            if statistics:
                statistics.dump_performance_counters(perfs_count_list)

        if statistics:
            statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
                                      [
                                          ('total execution time (ms)', f'{get_duration_in_milliseconds(total_duration_sec):.2f}'),
                                          ('total number of iterations', str(iteration)),
                                      ])
            # Latency is not reported when the MULTI device is used.
            if MULTI_DEVICE_NAME not in device_name:
                statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
                                          [
                                              ('latency (ms)', f'{latency_ms:.2f}'),
                                          ])
            statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
                                      [
                                          ('throughput', f'{fps:.2f}'),
                                      ])
            statistics.dump()

        print(f'Count: {iteration} iterations')
        print(f'Duration: {get_duration_in_milliseconds(total_duration_sec):.2f} ms')
        if MULTI_DEVICE_NAME not in device_name:
            print(f'Latency: {latency_ms:.2f} ms')
        print(f'Throughput: {fps:.2f} FPS')

        del exe_network

        # Reset the step counter kept on next_step once the run has completed.
        next_step.step_id = 0
    except Exception as e:
        logger.exception(e)

        if statistics:
            statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
                                      [
                                          ('error', str(e)),
                                      ])
            statistics.dump()
        sys.exit(1)