#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2022 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

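"""Benchmark bert-base-uncased inference with OpenVINO on the GLUE SST-2 validation split.

The script exports the model to ONNX with transformers, reads it with OpenVINO,
compiles it with a THROUGHPUT performance hint and runs the validation sentences
through an AsyncInferQueue, reporting the average sequence length, the average
processing time per sentence and the total duration. It takes no command-line
arguments; the model, tokenizer and dataset are downloaded on first use.
"""
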
import logging as log
from pathlib import Path
import sys
import tempfile
from time import perf_counter

import datasets
from openvino.runtime import Core, get_version, AsyncInferQueue, PartialShape
from transformers import AutoTokenizer
from transformers.onnx import export
from transformers.onnx.features import FeaturesManager


def main():
    log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout)
    log.info('OpenVINO:')
    log.info(f"{'Build ':.<39} {get_version()}")
    model_name = 'bert-base-uncased'
    # Download the model
    transformers_model = FeaturesManager.get_model_from_feature('default', model_name)
    _, model_onnx_config = FeaturesManager.check_supported_model_or_raise(transformers_model, feature='default')
    onnx_config = model_onnx_config(transformers_model.config)
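    # Note: the 'default' feature maps to the bare AutoModel without a
    # task-specific head, which is sufficient for a raw throughput benchmark.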
    # Download the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    core = Core()

    with tempfile.TemporaryDirectory() as tmp:
        onnx_path = Path(tmp) / f'{model_name}.onnx'
        # Export .onnx
        export(tokenizer, transformers_model, onnx_config, onnx_config.default_onnx_opset, onnx_path)
        # Read .onnx with OpenVINO
        model = core.read_model(onnx_path)

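    # read_model loads the ONNX file into memory, so the temporary directory
    # can be cleaned up here and everything below runs outside the `with` block.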
    # Enforce dynamic input shape
    try:
        model.reshape({model_input.any_name: PartialShape([1, '?']) for model_input in model.inputs})
    except RuntimeError:
        log.error("Can't set dynamic shape")
        raise
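    # [1, '?'] fixes the batch size at 1 and leaves the sequence-length
    # dimension dynamic, so sentences of any length can be fed without padding.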
    # Optimize for throughput. Best throughput can be reached by
    # running multiple openvino.runtime.InferRequest instances asynchronously
    tput = {'PERFORMANCE_HINT': 'THROUGHPUT'}
    # Pick a device by replacing CPU, for example MULTI:CPU(4),GPU(8).
    # It is possible to set CUMULATIVE_THROUGHPUT as PERFORMANCE_HINT for the AUTO device
    compiled_model = core.compile_model(model, 'CPU', tput)
    # AsyncInferQueue creates an optimal number of InferRequest instances
    ireqs = AsyncInferQueue(compiled_model)
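    # With no explicit size, the queue sizes itself from the compiled model's
    # optimal number of requests; start_async blocks until a request is idle.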

    sst2 = datasets.load_dataset('glue', 'sst2')
    sst2_sentences = sst2['validation']['sentence']
    # Warm up
    encoded_warm_up = dict(tokenizer('Warm up sentence is here.', return_tensors='np'))
    for _ in ireqs:
        ireqs.start_async(encoded_warm_up)
    ireqs.wait_all()
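    # Running one inference per request up front keeps one-time costs
    # (e.g. lazy buffer allocation) out of the timed loop below.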
    # Benchmark
    sum_seq_len = 0
    start = perf_counter()
    for sentence in sst2_sentences:
        encoded = dict(tokenizer(sentence, return_tensors='np'))
        sum_seq_len += next(iter(encoded.values())).size  # get sequence length to compute average length
        ireqs.start_async(encoded)
    ireqs.wait_all()
    end = perf_counter()
    duration = end - start
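    # Note: the timed loop includes tokenization, so the reported average
    # processing time covers tokenization plus inference.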
    log.info(f'Average sequence length: {sum_seq_len / len(sst2_sentences):.2f}')
    log.info(f'Average processing time: {duration / len(sst2_sentences) * 1e3:.2f} ms')
    log.info(f'Duration: {duration:.2f} seconds')


if __name__ == '__main__':
    main()