[CONFORMANCE] Parallelization over HW devices (#16431)

* init

* just fix version

* Update merge script

* remove extra code

* Uncomment correct func

* dd

* validate_nvidia

* Small refactoring

* Trigger linux build

* Update main.cpp

revert

* trigger

* fix build

* Update main.cpp
This commit is contained in:
Irina Efode 2023-03-30 14:45:49 +04:00 committed by GitHub
parent 086ee93bcd
commit 87365fa21d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 140 additions and 39 deletions

View File

@ -27,11 +27,13 @@ def parse_arguments():
output_folders_help = "Path to folder to save report" output_folders_help = "Path to folder to save report"
output_filename_help = "Output report filename" output_filename_help = "Output report filename"
report_type_help = "Report type: OP or API" report_type_help = "Report type: OP or API"
merge_device_id_help = "Merge all devices with suffix to one main device. Example: GPU.0 and GPU.1 -> GPU"
parser.add_argument("-i", "--input_folders", help=input_folders_help, nargs="*", required=True) parser.add_argument("-i", "--input_folders", help=input_folders_help, nargs="*", required=True)
parser.add_argument("-o", "--output_folder", help=output_folders_help, default=".") parser.add_argument("-o", "--output_folder", help=output_folders_help, default=".")
parser.add_argument("-f", "--output_filename", help=output_filename_help, default="report") parser.add_argument("-f", "--output_filename", help=output_filename_help, default="report")
parser.add_argument("-t", "--report_type", help=report_type_help, default="OP") parser.add_argument("-t", "--report_type", help=report_type_help, default="OP")
parser.add_argument("-m", "--merge_device_id", help=merge_device_id_help, default=False)
return parser.parse_args() return parser.parse_args()
@ -53,7 +55,8 @@ def update_result_node(xml_node: SubElement, aggregated_res: SubElement):
aggregated_res.set(attr_name, str(xml_value + aggregated_value)) aggregated_res.set(attr_name, str(xml_value + aggregated_value))
def aggregate_test_results(aggregated_results: SubElement, xml_reports: list, report_type: str): def aggregate_test_results(aggregated_results: SubElement, xml_reports: list,
report_type: str, merge_device_suffix=False):
aggregated_timestamp = None aggregated_timestamp = None
for xml in xml_reports: for xml in xml_reports:
# logger.info(f" Processing: {xml}") # logger.info(f" Processing: {xml}")
@ -67,16 +70,22 @@ def aggregate_test_results(aggregated_results: SubElement, xml_reports: list, re
if aggregated_timestamp is None or xml_timestamp < aggregated_timestamp: if aggregated_timestamp is None or xml_timestamp < aggregated_timestamp:
aggregated_timestamp = xml_timestamp aggregated_timestamp = xml_timestamp
for xml_device_entry in xml_results: for xml_device_entry in xml_results:
device_name = xml_device_entry.tag
if merge_device_suffix and "." in xml_device_entry.tag:
device_name = xml_device_entry.tag[:xml_device_entry.tag.find("."):]
new_data = ET.tostring(xml_device_entry).decode('utf8').replace(xml_device_entry.tag, device_name)
xml_device_entry = ET.fromstring(new_data)
aggregated_device_results = aggregated_results.find(xml_device_entry.tag) aggregated_device_results = aggregated_results.find(xml_device_entry.tag)
if aggregated_device_results is None:
aggregated_results.append(xml_device_entry)
aggregated_device_results = aggregated_results.find(xml_device_entry.tag)
# op or api_type
for xml_results_entry in xml_device_entry: for xml_results_entry in xml_device_entry:
aggregated_results_entry = aggregated_device_results.find(xml_results_entry.tag) aggregated_results_entry = None
if not aggregated_device_results is None:
aggregated_results_entry = aggregated_device_results.find(xml_results_entry.tag)
if aggregated_results_entry is None: if aggregated_results_entry is None:
stat_update_utils.update_rel_values(xml_results_entry) stat_update_utils.update_rel_values(xml_results_entry)
aggregated_device_results.append(xml_results_entry) if aggregated_device_results is None:
aggregated_results.append(xml_device_entry)
else:
aggregated_device_results.append(xml_results_entry)
continue continue
if report_type == "OP": if report_type == "OP":
update_result_node(xml_results_entry, aggregated_results_entry) update_result_node(xml_results_entry, aggregated_results_entry)
@ -91,7 +100,8 @@ def aggregate_test_results(aggregated_results: SubElement, xml_reports: list, re
return aggregated_timestamp return aggregated_timestamp
def merge_xml(input_folder_paths: list, output_folder_paths: str, output_filename: str, report_type: str): def merge_xml(input_folder_paths: list, output_folder_paths: str, output_filename: str,
report_type: str, merge_device_suffix=False):
logger.info(f" Processing is finished") logger.info(f" Processing is finished")
summary = Element("report") summary = Element("report")
@ -133,7 +143,7 @@ def merge_xml(input_folder_paths: list, output_folder_paths: str, output_filenam
for entity in xml_root.find(entity_name): for entity in xml_root.find(entity_name):
if entity_list.find(entity.tag) is None: if entity_list.find(entity.tag) is None:
SubElement(entity_list, entity.tag) SubElement(entity_list, entity.tag)
timestamp = aggregate_test_results(results, xml_reports, report_type) timestamp = aggregate_test_results(results, xml_reports, report_type, merge_device_suffix)
if report_type == "OP": if report_type == "OP":
stat_update_utils.update_passrates(results) stat_update_utils.update_passrates(results)
else: else:
@ -154,4 +164,4 @@ def merge_xml(input_folder_paths: list, output_folder_paths: str, output_filenam
if __name__ == "__main__": if __name__ == "__main__":
arguments = parse_arguments() arguments = parse_arguments()
merge_xml(arguments.input_folders, arguments.output_folder, arguments.output_filename, arguments.report_type) merge_xml(arguments.input_folders, arguments.output_folder, arguments.output_filename, arguments.report_type, arguments.merge_device_id)

View File

@ -19,12 +19,12 @@ from utils.conformance_utils import get_logger
from utils import file_utils from utils import file_utils
logger = get_logger('conformance_runner') logger = get_logger('conformance_runner')
is_hash = True has_python_api = True
try: try:
from rename_conformance_ir import create_hash from rename_conformance_ir import create_hash
except: except:
logger.warning("Please set the above env variable to get the same conformance ir names run by run!") logger.warning("Please set the above env variable to get the same conformance ir names run by run!")
is_hash = False has_python_api = False
API_CONFORMANCE_BIN_NAME = "apiConformanceTests" API_CONFORMANCE_BIN_NAME = "apiConformanceTests"
OP_CONFORMANCE_BIN_NAME = "conformanceTests" OP_CONFORMANCE_BIN_NAME = "conformanceTests"
@ -135,7 +135,7 @@ class Conformance:
logger.error("Process failed on step: 'Subgraph dumping'") logger.error("Process failed on step: 'Subgraph dumping'")
exit(-1) exit(-1)
self._model_path = conformance_ir_path self._model_path = conformance_ir_path
if is_hash: if has_python_api:
create_hash(Path(self._model_path)) create_hash(Path(self._model_path))
logger.info(f"All conformance IRs in {self._ov_bin_path} were renamed based on hash") logger.info(f"All conformance IRs in {self._ov_bin_path} were renamed based on hash")
else: else:
@ -173,7 +173,7 @@ class Conformance:
final_report_name = f'report_{self._type.lower()}' final_report_name = f'report_{self._type.lower()}'
# API Conformance contains both report type # API Conformance contains both report type
merge_xml([parallel_report_dir], report_dir, final_report_name, self._type) merge_xml([parallel_report_dir], report_dir, final_report_name, self._type, True)
if self._type == constants.API_CONFORMANCE: if self._type == constants.API_CONFORMANCE:
final_op_report_name = f'report_{constants.OP_CONFORMANCE.lower()}' final_op_report_name = f'report_{constants.OP_CONFORMANCE.lower()}'
merge_xml([parallel_report_dir], report_dir, final_op_report_name, constants.OP_CONFORMANCE.lower()) merge_xml([parallel_report_dir], report_dir, final_op_report_name, constants.OP_CONFORMANCE.lower())

View File

@ -13,7 +13,6 @@ from shutil import rmtree
import os import os
import sys import sys
import threading import threading
import platform
import csv import csv
import datetime import datetime
import shlex import shlex
@ -23,14 +22,20 @@ if sys.version_info.major >= 3:
else: else:
import thread import thread
has_python_api = True
logger = get_logger('test_parallel_runner')
try:
from utils.get_available_devices import get_available_devices
except:
logger.warning("Please set the above env variable to get the same conformance ir names run by run!")
has_python_api = False
FILENAME_LENGTH = 255 FILENAME_LENGTH = 255
LOG_NAME_REPLACE_STR = "##NAME##" LOG_NAME_REPLACE_STR = "##NAME##"
DEFAULT_PROCESS_TIMEOUT = 3600 DEFAULT_PROCESS_TIMEOUT = 3600
DEFAULT_TEST_TIMEOUT = 900 DEFAULT_TEST_TIMEOUT = 900
MAX_LENGHT = 4096 if not constants.IS_WIN else 8191 MAX_LENGHT = 4096 if not constants.IS_WIN else 8191
logger = get_logger('test_parallel_runner')
def parse_arguments(): def parse_arguments():
parser = ArgumentParser() parser = ArgumentParser()
exec_file_path_help = "Path to the test executable file" exec_file_path_help = "Path to the test executable file"
@ -38,10 +43,12 @@ def parse_arguments():
worker_num_help = "Worker number. Default value is `cpu_count-1` " worker_num_help = "Worker number. Default value is `cpu_count-1` "
working_dir_num_help = "Working dir" working_dir_num_help = "Working dir"
process_timeout_help = "Process timeout in s" process_timeout_help = "Process timeout in s"
parallel_help = "Parallel over HW devices. For example run tests over GPU.0, GPU.1 and etc"
parser.add_argument("-e", "--exec_file", help=exec_file_path_help, type=str, required=True) parser.add_argument("-e", "--exec_file", help=exec_file_path_help, type=str, required=True)
parser.add_argument("-c", "--cache_path", help=cache_path_help, type=str, required=False, default="") parser.add_argument("-c", "--cache_path", help=cache_path_help, type=str, required=False, default="")
parser.add_argument("-j", "--workers", help=worker_num_help, type=int, required=False, default=(os.cpu_count() - 1) if os.cpu_count() > 2 else 1) parser.add_argument("-j", "--workers", help=worker_num_help, type=int, required=False, default=(os.cpu_count() - 1) if os.cpu_count() > 2 else 1)
parser.add_argument("-p", "--parallel_devices", help=parallel_help, type=int, required=False, default=1)
parser.add_argument("-w", "--working_dir", help=working_dir_num_help, type=str, required=False, default=".") parser.add_argument("-w", "--working_dir", help=working_dir_num_help, type=str, required=False, default=".")
parser.add_argument("-t", "--process_timeout", help=process_timeout_help, type=int, required=False, default=DEFAULT_PROCESS_TIMEOUT) parser.add_argument("-t", "--process_timeout", help=process_timeout_help, type=int, required=False, default=DEFAULT_PROCESS_TIMEOUT)
return parser.parse_args() return parser.parse_args()
@ -55,6 +62,21 @@ def get_test_command_line_args():
break break
return command_line_args return command_line_args
def get_device_by_args(args: list):
    """Extract the value of the ``--device`` option from a command line.

    Both spellings are recognised: ``--device=NAME`` (value taken from the
    text after the first ``=``) and ``--device NAME`` (value taken from the
    first following token that does not start with ``-``).

    Args:
        args: The command line as a sequence of tokens. A single command
            string is accepted as well and is split on whitespace first.

    Returns:
        The device name as a string, or ``None`` when no ``--device`` option
        with a value is present.
    """
    if isinstance(args, str):
        # ``Popen.args`` is a plain string when the process was launched from
        # a command string; iterating it directly would walk single characters
        # and never match the option.
        args = args.split()
    device = None
    is_device = False
    for argument in args:
        if "--device" in argument:
            is_device = True
            _, separator, value = argument.partition("=")
            if not separator:
                # ``--device NAME`` form: the value is in a later token.
                continue
            device = value
            break
        # Skip empty tokens instead of indexing into them (IndexError before).
        if is_device and argument and not argument.startswith("-"):
            device = argument
            break
    return device
# Class to read test cache # Class to read test cache
class TestStructure: class TestStructure:
_name = "" _name = ""
@ -67,7 +89,7 @@ class TestStructure:
class TaskManager: class TaskManager:
process_timeout = -1 process_timeout = -1
def __init__(self, command_list:list, working_dir: os.path, prev_run_cmd_length = 0): def __init__(self, command_list:list, working_dir: os.path, prev_run_cmd_length=0, device=None, available_devices=list()):
self._command_list = command_list self._command_list = command_list
self._process_list = list() self._process_list = list()
self._workers = list() self._workers = list()
@ -75,6 +97,14 @@ class TaskManager:
self._log_filename = os.path.join(working_dir, f"log_{LOG_NAME_REPLACE_STR}.log") self._log_filename = os.path.join(working_dir, f"log_{LOG_NAME_REPLACE_STR}.log")
self._prev_run_cmd_length = prev_run_cmd_length self._prev_run_cmd_length = prev_run_cmd_length
self._idx = 0 self._idx = 0
self._device = device
if self._device is None:
self._device = "NOT_AFFECTED_BY_DEVICE"
if len(available_devices) > 0:
self._available_devices = available_devices
else:
self._available_devices = [self._device]
self._device_cnt = len(self._available_devices)
def __create_thread(self, func): def __create_thread(self, func):
thread = threading.Thread(target=func) thread = threading.Thread(target=func)
@ -86,19 +116,23 @@ class TaskManager:
if len(self._command_list) <= self._idx: if len(self._command_list) <= self._idx:
logger.warning(f"Skip worker initialiazation. Command list lenght <= worker index") logger.warning(f"Skip worker initialiazation. Command list lenght <= worker index")
return return
log_file_name = self._log_filename.replace(LOG_NAME_REPLACE_STR, str(self._idx + self._prev_run_cmd_length)) if self._device_cnt == 0:
with open(log_file_name, "w") as log_file: logger.error(f"Empty available devices! Check your device!")
args = self._command_list[self._idx] exit(-1)
if not constants.IS_WIN: for target_device in self._available_devices:
args = shlex.split(self._command_list[self._idx]) log_file_name = self._log_filename.replace(LOG_NAME_REPLACE_STR, str(self._idx + self._prev_run_cmd_length))
worker = self.__create_thread( with open(log_file_name, "w") as log_file:
self._process_list.append(Popen(args, shell=constants.IS_WIN, stdout=log_file, stderr=log_file))) args = self._command_list[self._idx].replace(self._device, target_device)
self._workers.append(worker) if not constants.IS_WIN:
worker.join() args = shlex.split(args)
self._timers.append(datetime.datetime.now()) worker = self.__create_thread(
log_file.close() self._process_list.append(Popen(args, shell=constants.IS_WIN, stdout=log_file, stderr=log_file)))
# logger.info(f"{self._idx}/{len(self._command_list)} is started") self._workers.append(worker)
self._idx += 1 worker.join()
self._timers.append(datetime.datetime.now())
log_file.close()
# logger.info(f"{self._idx}/{len(self._command_list)} is started")
self._idx += 1
def __find_free_process(self): def __find_free_process(self):
while True: while True:
@ -108,24 +142,25 @@ class TaskManager:
logger.warning(f"Process {pid} exceed time limetattion per process") logger.warning(f"Process {pid} exceed time limetattion per process")
self._process_list[pid].kill() self._process_list[pid].kill()
self._process_list[pid].wait(timeout=0) self._process_list[pid].wait(timeout=0)
device = get_device_by_args(self._process_list[pid].args)
# logger.info(f"{self._idx}/{len(self._command_list)} is started") # logger.info(f"{self._idx}/{len(self._command_list)} is started")
return pid return pid, device
except TimeoutExpired: except TimeoutExpired:
continue continue
def __update_process(self, pid:int, log_file): def __update_process(self, pid:int, log_file, device):
args = self._command_list[self._idx] args = self._command_list[self._idx].replace(self._device, device)
if not constants.IS_WIN: if not constants.IS_WIN:
args = shlex.split(self._command_list[self._idx]) args = shlex.split(args)
self._process_list[pid] = Popen(args, shell=constants.IS_WIN, stdout=log_file, stderr=log_file) self._process_list[pid] = Popen(args, shell=constants.IS_WIN, stdout=log_file, stderr=log_file)
def update_worker(self): def update_worker(self):
if self._idx >= len(self._command_list): if self._idx >= len(self._command_list):
return False return False
pid = self.__find_free_process() pid, device = self.__find_free_process()
log_file_name = self._log_filename.replace(LOG_NAME_REPLACE_STR, str(self._idx + self._prev_run_cmd_length)) log_file_name = self._log_filename.replace(LOG_NAME_REPLACE_STR, str(self._idx + self._prev_run_cmd_length))
with open(log_file_name, "w") as log_file: with open(log_file_name, "w") as log_file:
self._workers[pid] = self.__create_thread(self.__update_process(pid, log_file)) self._workers[pid] = self.__create_thread(self.__update_process(pid, log_file, device))
self._workers[pid].join() self._workers[pid].join()
self._timers[pid] = datetime.datetime.now() self._timers[pid] = datetime.datetime.now()
self._idx += 1 self._idx += 1
@ -165,6 +200,12 @@ class TestParallelRunner:
self._is_save_cache = True self._is_save_cache = True
self._disabled_tests = list() self._disabled_tests = list()
self._total_test_cnt = 0 self._total_test_cnt = 0
self._available_devices = None
self._device = get_device_by_args(self._command.split())
if has_python_api:
self._available_devices = get_available_devices(self._device)
else:
self._available_devices = [self._device] if not self._device is None else []
def __init_basic_command_line_for_exec_file(self, test_command_line: list): def __init_basic_command_line_for_exec_file(self, test_command_line: list):
command = f'{self._exec_file_path}' command = f'{self._exec_file_path}'
@ -350,7 +391,7 @@ class TestParallelRunner:
def __execute_tests(self, filters: list(), prev_worker_cnt = 0): def __execute_tests(self, filters: list(), prev_worker_cnt = 0):
commands = [f'{self._command} --gtest_filter={filter}' for filter in filters] commands = [f'{self._command} --gtest_filter={filter}' for filter in filters]
task_manager = TaskManager(commands, self._working_dir, prev_worker_cnt) task_manager = TaskManager(commands, self._working_dir, prev_worker_cnt, self._device, self._available_devices)
for _ in progressbar(range(self._worker_num), "Worker initialization: ", 40): for _ in progressbar(range(self._worker_num), "Worker initialization: ", 40):
task_manager.init_worker() task_manager.init_worker()
for _ in progressbar(range(len(commands) - self._worker_num), "Worker execution: ", 40): for _ in progressbar(range(len(commands) - self._worker_num), "Worker execution: ", 40):
@ -362,6 +403,8 @@ class TestParallelRunner:
if TaskManager.process_timeout == -1: if TaskManager.process_timeout == -1:
TaskManager.process_timeout = DEFAULT_PROCESS_TIMEOUT TaskManager.process_timeout = DEFAULT_PROCESS_TIMEOUT
logger.info(f"Run test parallel is started. Worker num is {self._worker_num}") logger.info(f"Run test parallel is started. Worker num is {self._worker_num}")
if len(self._available_devices) > 1:
logger.info(f"Tests will be run over devices: {self._available_devices} instead of {self._device}")
t_start = datetime.datetime.now() t_start = datetime.datetime.now()
filters_cache, filters_runtime = self.__get_filters() filters_cache, filters_runtime = self.__get_filters()
@ -436,6 +479,11 @@ class TestParallelRunner:
test_cnt_expected = line.count(':') test_cnt_expected = line.count(':')
if constants.RUN in line: if constants.RUN in line:
test_name = line[line.find(constants.RUN) + len(constants.RUN) + 1:-1:] test_name = line[line.find(constants.RUN) + len(constants.RUN) + 1:-1:]
if self._device != None and self._available_devices != None:
for device_name in self._available_devices:
if device_name in test_name:
test_name = test_name.replace(device_name, self._device)
break
if constants.REF_COEF in line: if constants.REF_COEF in line:
ref_k = float(line[line.rfind(' ') + 1:]) ref_k = float(line[line.rfind(' ') + 1:])
if dir is None: if dir is None:

View File

@ -22,7 +22,7 @@ OS_BIN_FILE_EXT = ".exe" if IS_WIN else ""
ENV_SEPARATOR = ";" if IS_WIN else ":" ENV_SEPARATOR = ";" if IS_WIN else ":"
PYTHON_NAME = "python" if IS_WIN else "python3" PYTHON_NAME = "python" if IS_WIN else "python3"
PIP_NAME = "pip" if IS_WIN else "pip3" PIP_NAME = "pip" if IS_WIN else "pip3"
LD_LIB_PATH_NAME = "PATH" if IS_WIN or platform == "darwin" else "LD_LIBRARY_PATH" LD_LIB_PATH_NAME = "PATH" if IS_WIN else "LD_LIBRARY_PATH"
OPENVINO_NAME = 'openvino' OPENVINO_NAME = 'openvino'
PY_OPENVINO = "python_api" PY_OPENVINO = "python_api"

View File

@ -0,0 +1,43 @@
# Copyright (C) 2018-2023 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Import the OpenVINO Python runtime. When the import fails for any reason,
# locate a locally built Python API, log the environment variables the user
# has to set, and terminate via exit().
try:
    from openvino.runtime import Core
except:
    # Helpers are imported lazily: they are only needed on the failure path.
    from utils.file_utils import get_ov_path, find_latest_dir
    import os
    from utils.constants import PY_OPENVINO, LD_LIB_PATH_NAME
    from utils.conformance_utils import get_logger, set_env_variable
    logger = get_logger("get_available_device")
    # Directory containing this script; used as the starting point for get_ov_path.
    script_dir, _ = os.path.split(os.path.abspath(__file__))
    ov_bin_path = get_ov_path(script_dir, None, True)
    if PY_OPENVINO in os.listdir(ov_bin_path):
        env = os.environ
        py_ov = os.path.join(ov_bin_path, PY_OPENVINO)
        # presumably picks the newest sub-directory of the python_api folder - TODO confirm
        py_ov = os.path.join(py_ov, find_latest_dir(py_ov))
        env = set_env_variable(env, "PYTHONPATH", py_ov)
        env = set_env_variable(env, LD_LIB_PATH_NAME, ov_bin_path)
        # NOTE(review): the message mentions renaming conformance IRs by hash,
        # which is not what this module does - confirm the wording is intended.
        logger.warning("Set the following env varibles to rename conformance ir based on hash: ")
        logger.warning(f'PYTHONPATH={env["PYTHONPATH"]}')
        logger.warning(f'{LD_LIB_PATH_NAME}={env[LD_LIB_PATH_NAME]}')
        # exit() raises SystemExit, so importing this module does not complete.
        exit(0)
    else:
        logger.error(f'Impossible to run the tool! PyOpenVINO was not built!')
        exit(-1)
def get_available_devices(target_device = None, exclude_device = None):
    """List the runtime devices whose names match ``target_device``.

    Matching is by substring: a device is kept when ``target_device`` is
    ``None`` or occurs inside the device name, and dropped when
    ``exclude_device`` occurs inside the device name.

    Args:
        target_device: Substring a device name must contain, or ``None`` to
            accept every device.
        exclude_device: Substring that disqualifies a device name, or ``None``
            to exclude nothing.

    Returns:
        A list of device names taken from ``Core().available_devices``, in
        the order they are reported.
    """
    # With no exclusion requested, fall back to a placeholder substring that
    # is not expected to occur in any real device name.
    excluded = "NOT_EXISTED_DEVICE" if exclude_device is None else exclude_device
    runtime = Core()
    return [
        name
        for name in runtime.available_devices
        if (target_device is None or target_device in name) and excluded not in name
    ]