From 9249ea799f10d7cc9c444d1e110b99814cec9780 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?=
Date: Thu, 4 Jul 2024 21:07:02 +0200
Subject: [PATCH 01/12] #2300: scripts: clean up JSON schema validation

---
 scripts/JSON_data_files_validator.py | 33 ++++++++++++++--------------
 scripts/check_lb_data_files.sh       |  2 ++
 2 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/scripts/JSON_data_files_validator.py b/scripts/JSON_data_files_validator.py
index 9bed3eb368..c82b359245 100644
--- a/scripts/JSON_data_files_validator.py
+++ b/scripts/JSON_data_files_validator.py
@@ -12,6 +12,7 @@
 from collections import Counter
 from collections.abc import Iterable
 import json
+import logging
 
 import brotli
 from schema import And, Optional, Schema
@@ -328,7 +329,7 @@ def __check_if_dir_exists(dir_path: str) -> bool:
 
     @staticmethod
     def __get_files_for_validation(dir_path: str, file_prefix: str, file_suffix: str) -> list:
-        """ Check for existence of a given directory. Returns True when file exists. """
+        """ Get a sorted list of files from a directory. """
         list_of_files = os.listdir(dir_path)
 
         if not list_of_files:
@@ -354,32 +355,30 @@ def __get_files_for_validation(dir_path: str, file_prefix: str, file_suffix: str
     @staticmethod
     def __validate_file(file_path):
         """ Validates the file against the schema. """
-        print(f"Validating file: {file_path}")
-        with open(file_path, "rb") as compr_json_file:
-            compr_bytes = compr_json_file.read()
-            try:
-                decompr_bytes = brotli.decompress(compr_bytes)
-                decompressed_dict = json.loads(decompr_bytes.decode("utf-8"))
-            except brotli.error:
-                decompressed_dict = json.loads(compr_bytes.decode("utf-8"))
+        logging.info(f"Validating file: {file_path}")
+        with open(file_path, "rb") as json_file:
+            content = json_file.read()
+        if file_path.endswith('.br'):
+            content = brotli.decompress(content)
+        json_data = json.loads(content.decode("utf-8"))
 
         # Extracting type from JSON data
         schema_type = None
-        if decompressed_dict.get("metadata") is not None:
-            schema_type = decompressed_dict.get("metadata").get("type")
+        if json_data.get("metadata") is not None:
+            schema_type = json_data.get("metadata").get("type")
         else:
-            if decompressed_dict.get("type") is not None:
-                schema_type = decompressed_dict.get("type")
+            if json_data.get("type") is not None:
+                schema_type = json_data.get("type")
 
         if schema_type is not None:
             # Validate schema
-            if SchemaValidator(schema_type=schema_type).is_valid(schema_to_validate=decompressed_dict):
-                print(f"Valid JSON schema in {file_path}")
+            if SchemaValidator(schema_type=schema_type).is_valid(schema_to_validate=json_data):
+                logging.info(f"Valid JSON schema in {file_path}")
             else:
                 print(f"Invalid JSON schema in {file_path}")
-                SchemaValidator(schema_type=schema_type).validate(schema_to_validate=decompressed_dict)
+                SchemaValidator(schema_type=schema_type).validate(schema_to_validate=json_data)
         else:
-            print(f"Schema type not found in file: {file_path}. \nPassing by default when schema type not found.")
+            logging.warning(f"Schema type not found in file: {file_path}. \nPassing by default when schema type not found.")
 
     def main(self):
         if self.__file_path is not None:
diff --git a/scripts/check_lb_data_files.sh b/scripts/check_lb_data_files.sh
index 66301ef535..34041cb18f 100755
--- a/scripts/check_lb_data_files.sh
+++ b/scripts/check_lb_data_files.sh
@@ -6,6 +6,8 @@ path_to_vt_build_dir=${1}
 path_to_vt_src_dir=${2}
 cd "$path_to_vt_build_dir" || exit 1
 
+set +x
+
 function run_schema_validator() {
     file=$1
     echo "Running schema validator on: $file"
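
For orientation between the patches: the reworked reader above keys decompression off the file extension instead of probing the bytes. A minimal standalone sketch of the same flow (the helper name and sample paths are hypothetical; brotli is the same third-party module the script imports):

    import json
    import brotli

    def read_vt_json(file_path):
        # Read raw bytes; decompress only when the ".br" suffix says the
        # file is Brotli-compressed, then parse the JSON payload.
        with open(file_path, "rb") as json_file:
            content = json_file.read()
        if file_path.endswith(".br"):
            content = brotli.decompress(content)
        return json.loads(content.decode("utf-8"))

    # e.g. read_vt_json("data.0.json") or read_vt_json("data.0.json.br")
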
From b3eec245b1db7936648fbab2ca9a1654964321f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?=
Date: Thu, 4 Jul 2024 21:46:31 +0200
Subject: [PATCH 02/12] #2300: scripts: add `--validate_comm_links` option

---
 scripts/JSON_data_files_validator.py | 21 ++++++++++++++++-----
 scripts/check_lb_data_files.sh       |  2 +-
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/scripts/JSON_data_files_validator.py b/scripts/JSON_data_files_validator.py
index c82b359245..6e1c5545e8 100644
--- a/scripts/JSON_data_files_validator.py
+++ b/scripts/JSON_data_files_validator.py
@@ -292,11 +292,14 @@
 class JSONDataFilesValidator:
     """ Class validating VT data files according to defined schema. """
-    def __init__(self, file_path: str = None, dir_path: str = None, file_prefix: str = None, file_suffix: str = None):
+    def __init__(self, file_path: str = None, dir_path: str = None,
+                 file_prefix: str = None, file_suffix: str = None,
+                 validate_comm_links: bool = False):
         self.__file_path = file_path
         self.__dir_path = dir_path
         self.__file_prefix = file_prefix
         self.__file_suffix = file_suffix
+        self.__validate_comm_links = validate_comm_links
         self.__cli()
 
     def __cli(self):
@@ -307,6 +310,7 @@ def __cli(self):
         group.add_argument("--file_path", help="Path to a validated file. Pass only when validating a single file.")
         parser.add_argument("--file_prefix", help="File prefix. Optional. Pass only when --dir_path is provided.")
         parser.add_argument("--file_suffix", help="File suffix. Optional. Pass only when --dir_path is provided.")
+        parser.add_argument("--validate_comm_links", help='Verify that comm links reference tasks.', action='store_true')
         args = parser.parse_args()
         if args.file_path:
             self.__file_path = os.path.abspath(args.file_path)
@@ -316,6 +320,7 @@ def __cli(self):
             self.__file_prefix = args.file_prefix
         if args.file_suffix:
             self.__file_suffix = args.file_suffix
+        self.__validate_comm_links = args.validate_comm_links
 
     @staticmethod
     def __check_if_file_exists(file_path: str) -> bool:
@@ -353,7 +358,7 @@ def __get_files_for_validation(dir_path: str, file_prefix: str, file_suffix: str
                       key=lambda x: int(x.split(os.sep)[-1].split('.')[-2]))
 
     @staticmethod
-    def __validate_file(file_path):
+    def __validate_file(file_path, validate_comm_links):
         """ Validates the file against the schema. """
         logging.info(f"Validating file: {file_path}")
         with open(file_path, "rb") as json_file:
@@ -375,15 +380,20 @@ def __validate_file(file_path):
             if SchemaValidator(schema_type=schema_type).is_valid(schema_to_validate=json_data):
                 logging.info(f"Valid JSON schema in {file_path}")
             else:
-                print(f"Invalid JSON schema in {file_path}")
+                logging.error(f"Invalid JSON schema in {file_path}")
                 SchemaValidator(schema_type=schema_type).validate(schema_to_validate=json_data)
         else:
             logging.warning(f"Schema type not found in file: {file_path}. \nPassing by default when schema type not found.")
 
+        if validate_comm_links:
+            logging.error("FIXME: comm_links validation not implemented.")
+
+
     def main(self):
         if self.__file_path is not None:
             if self.__check_if_file_exists(file_path=self.__file_path):
-                self.__validate_file(file_path=self.__file_path)
+                self.__validate_file(file_path=self.__file_path,
+                                     validate_comm_links=self.__validate_comm_links)
             else:
                 sys.excepthook = exc_handler
                 raise FileNotFoundError(f"File: {self.__file_path} NOT found")
@@ -393,7 +403,8 @@ def main(self):
                                                                                file_prefix=self.__file_prefix,
                                                                                file_suffix=self.__file_suffix)
             for file in list_of_files_for_validation:
-                self.__validate_file(file_path=file)
+                self.__validate_file(file_path=file,
+                                     validate_comm_links=self.__validate_comm_links)
         else:
             sys.excepthook = exc_handler
             raise FileNotFoundError(f"Directory: {self.__dir_path} does NOT exist")
diff --git a/scripts/check_lb_data_files.sh b/scripts/check_lb_data_files.sh
index 34041cb18f..4c37ac18dc 100755
--- a/scripts/check_lb_data_files.sh
+++ b/scripts/check_lb_data_files.sh
@@ -11,7 +11,7 @@ set +x
 function run_schema_validator() {
     file=$1
     echo "Running schema validator on: $file"
-    if python3 "${path_to_vt_src_dir}/scripts/JSON_data_files_validator.py" --file_path="$file"
+    if python3 "${path_to_vt_src_dir}/scripts/JSON_data_files_validator.py" --file_path="$file" --validate_comm_links
     then
         echo "Valid file"
     else
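
The new switch is a plain argparse boolean: when omitted it parses to False, so existing invocations keep their behavior. A self-contained sketch of that semantics (stand-alone parser, not the script's full CLI):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--validate_comm_links", action="store_true",
                        help="Verify that comm links reference tasks.")

    # Flag absent -> False; flag present -> True.
    assert parser.parse_args([]).validate_comm_links is False
    assert parser.parse_args(["--validate_comm_links"]).validate_comm_links is True
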
From e38a393209163ecb852c016174c3dc2136d2d44e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?=
Date: Thu, 4 Jul 2024 22:14:08 +0200
Subject: [PATCH 03/12] #2300: scripts: lower verbosity

---
 scripts/JSON_data_files_validator.py | 2 +-
 scripts/check_lb_data_files.sh       | 5 +----
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/scripts/JSON_data_files_validator.py b/scripts/JSON_data_files_validator.py
index 6e1c5545e8..f2c65b6711 100644
--- a/scripts/JSON_data_files_validator.py
+++ b/scripts/JSON_data_files_validator.py
@@ -386,7 +386,7 @@ def __validate_file(file_path, validate_comm_links):
             logging.warning(f"Schema type not found in file: {file_path}. \nPassing by default when schema type not found.")
 
         if validate_comm_links:
-            logging.error("FIXME: comm_links validation not implemented.")
+            logging.info("FIXME: comm_links validation not implemented.")
 
 
     def main(self):
diff --git a/scripts/check_lb_data_files.sh b/scripts/check_lb_data_files.sh
index 4c37ac18dc..656dcb2bba 100755
--- a/scripts/check_lb_data_files.sh
+++ b/scripts/check_lb_data_files.sh
@@ -6,14 +6,11 @@ path_to_vt_build_dir=${1}
 path_to_vt_src_dir=${2}
 cd "$path_to_vt_build_dir" || exit 1
 
-set +x
-
 function run_schema_validator() {
     file=$1
-    echo "Running schema validator on: $file"
     if python3 "${path_to_vt_src_dir}/scripts/JSON_data_files_validator.py" --file_path="$file" --validate_comm_links
     then
-        echo "Valid file"
+        echo "Valid JSON schema in $file"
     else
         >&2 echo "Invalid schema in $file.. exiting"
         exit 1;
     fi
 }
From d647e9f9d2a7bafb5985755d01c1b19a9928b2dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?=
Date: Tue, 9 Jul 2024 01:26:04 +0200
Subject: [PATCH 04/12] #2300: scripts: validate edges in single file

---
 scripts/JSON_data_files_validator.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/scripts/JSON_data_files_validator.py b/scripts/JSON_data_files_validator.py
index f2c65b6711..a6b9d3eeee 100644
--- a/scripts/JSON_data_files_validator.py
+++ b/scripts/JSON_data_files_validator.py
@@ -385,9 +385,18 @@ def __validate_file(file_path, validate_comm_links):
         else:
             logging.warning(f"Schema type not found in file: {file_path}. \nPassing by default when schema type not found.")
 
-        if validate_comm_links:
-            logging.info("FIXME: comm_links validation not implemented.")
+        if validate_comm_links and schema_type == "LBDatafile":
+            JSONDataFilesValidator.__validate_comm_links(json_data)
 
+    @staticmethod
+    def __validate_comm_links(data):
+        for phase in data["phases"]:
+            comm_ids = {int(comm["from"]["id"]) for comm in phase["communications"]}
+            comm_ids.update({int(comm["to"]["id"]) for comm in phase["communications"]})
+            task_ids = {int(task["entity"]["id"]) for task in phase["tasks"]}
+
+            if not comm_ids.issubset(task_ids):
+                logging.error(f" Phase {phase['id']}: tasks {comm_ids - task_ids} were referenced in communication, but were not found.")
 
     def main(self):
         if self.__file_path is not None:
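
The check introduced above is a per-phase set comparison: every endpoint id that appears under "communications" must also appear as a task "entity" id. A worked toy example in the same data layout (values are made up):

    phase = {
        "id": 0,
        "tasks": [{"entity": {"id": 0}}, {"entity": {"id": 1}}],
        "communications": [{"from": {"id": 0}, "to": {"id": 2}}],
    }

    comm_ids = {int(c["from"]["id"]) for c in phase["communications"]}
    comm_ids.update(int(c["to"]["id"]) for c in phase["communications"])
    task_ids = {int(t["entity"]["id"]) for t in phase["tasks"]}

    # id 2 is referenced by an edge but owns no task, so it gets reported.
    assert comm_ids - task_ids == {2}
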
""" - logging.info(f"Validating file: {file_path}") + def __get_json(file_path): with open(file_path, "rb") as json_file: content = json_file.read() if file_path.endswith('.br'): content = brotli.decompress(content) - json_data = json.loads(content.decode("utf-8")) + return json.loads(content.decode("utf-8")) + + @staticmethod + def __validate_file(file_path, validate_comm_links): + """ Validates the file against the schema. """ + logging.info(f"Validating file: {file_path}") + json_data = JSONDataFilesValidator.__get_json(file_path) # Extracting type from JSON data schema_type = None @@ -386,7 +390,27 @@ def __validate_file(file_path, validate_comm_links): logging.warning(f"Schema type not found in file: {file_path}. \nPassing by default when schema type not found.") if validate_comm_links and schema_type == "LBDatafile": - JSONDataFilesValidator.__validate_comm_links(json_data) + basename = os.path.basename(file_path) + dirname = os.path.dirname(file_path) + digits = ''.join(filter(lambda c: c.isdigit(), basename)) + has_number = digits.isnumeric() + + if not has_number: + JSONDataFilesValidator.__validate_comm_links(json_data) + elif int(digits) == 0: + files = JSONDataFilesValidator.__get_files_for_validation(dirname, None, None) + all_data = [JSONDataFilesValidator.__get_json(file) for file in files] + + comm_ids = set() + task_ids = set() + for data in all_data: + for phase in data["phases"]: + comm_ids.update({int(comm["from"]["id"]) for comm in phase["communications"]} ) + comm_ids.update({int(comm["to"]["id"]) for comm in phase["communications"]}) + task_ids.update({int(task["entity"]["id"]) for task in phase["tasks"]}) + if not comm_ids.issubset(task_ids): + logging.error(f" Phase {phase["id"]}: tasks {comm_ids - task_ids} were referenced in communication, but were not found.") + @staticmethod def __validate_comm_links(data): From 64f491fb276395d6e1c551d309cdf0cf24cc0eb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Tue, 9 Jul 2024 15:54:01 +0200 Subject: [PATCH 06/12] #2300: scripts: remove redundant code --- scripts/JSON_data_files_validator.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/scripts/JSON_data_files_validator.py b/scripts/JSON_data_files_validator.py index e1bfec0018..509b51ff2a 100644 --- a/scripts/JSON_data_files_validator.py +++ b/scripts/JSON_data_files_validator.py @@ -322,16 +322,6 @@ def __cli(self): self.__file_suffix = args.file_suffix self.__validate_comm_links = args.validate_comm_links - @staticmethod - def __check_if_file_exists(file_path: str) -> bool: - """ Check for existence of a given file. Returns True when file exists. """ - return os.path.isfile(file_path) - - @staticmethod - def __check_if_dir_exists(dir_path: str) -> bool: - """ Check for existence of a given directory. Returns True when file exists. """ - return os.path.isdir(dir_path) - @staticmethod def __get_files_for_validation(dir_path: str, file_prefix: str, file_suffix: str) -> list: """ Get a sorted list of files from directory. 
""" @@ -424,14 +414,14 @@ def __validate_comm_links(data): def main(self): if self.__file_path is not None: - if self.__check_if_file_exists(file_path=self.__file_path): + if os.path.isfile(self.__file_path): self.__validate_file(file_path=self.__file_path, validate_comm_links=self.__validate_comm_links) else: sys.excepthook = exc_handler raise FileNotFoundError(f"File: {self.__file_path} NOT found") elif self.__dir_path is not None: - if self.__check_if_dir_exists(dir_path=self.__dir_path): + if os.path.isdir(self.__dir_path): list_of_files_for_validation = self.__get_files_for_validation(dir_path=self.__dir_path, file_prefix=self.__file_prefix, file_suffix=self.__file_suffix) From 09f57ceb39cbc5950e32190f6668739bde4db038 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Tue, 9 Jul 2024 16:34:00 +0200 Subject: [PATCH 07/12] #2300: scripts: general cleanup and error reporting --- scripts/JSON_data_files_validator.py | 83 +++++++++++++++------------- 1 file changed, 45 insertions(+), 38 deletions(-) diff --git a/scripts/JSON_data_files_validator.py b/scripts/JSON_data_files_validator.py index 509b51ff2a..a4ecb061d2 100644 --- a/scripts/JSON_data_files_validator.py +++ b/scripts/JSON_data_files_validator.py @@ -290,6 +290,13 @@ def validate(self, schema_to_validate: dict): return self.valid_schema.validate(schema_to_validate) +def get_json(file_path): + with open(file_path, "rb") as json_file: + content = json_file.read() + if file_path.endswith('.br'): + content = brotli.decompress(content) + return json.loads(content.decode("utf-8")) + class JSONDataFilesValidator: """ Class validating VT data files according do defined schema. """ def __init__(self, file_path: str = None, dir_path: str = None, @@ -347,19 +354,10 @@ def __get_files_for_validation(dir_path: str, file_prefix: str, file_suffix: str return sorted([os.path.join(dir_path, file) for file in list_of_files], key=lambda x: int(x.split(os.sep)[-1].split('.')[-2])) - @staticmethod - def __get_json(file_path): - with open(file_path, "rb") as json_file: - content = json_file.read() - if file_path.endswith('.br'): - content = brotli.decompress(content) - return json.loads(content.decode("utf-8")) - - @staticmethod - def __validate_file(file_path, validate_comm_links): + def __validate_file(self, file_path): """ Validates the file against the schema. """ logging.info(f"Validating file: {file_path}") - json_data = JSONDataFilesValidator.__get_json(file_path) + json_data = get_json(file_path) # Extracting type from JSON data schema_type = None @@ -377,46 +375,56 @@ def __validate_file(file_path, validate_comm_links): logging.error(f"Invalid JSON schema in {file_path}") SchemaValidator(schema_type=schema_type).validate(schema_to_validate=json_data) else: - logging.warning(f"Schema type not found in file: {file_path}. \nPassing by default when schema type not found.") + logging.warning(f"Schema type not found in file: {file_path}. 
\n" + "Passing by default when schema type not found.") - if validate_comm_links and schema_type == "LBDatafile": + if self.__validate_comm_links and schema_type == "LBDatafile": basename = os.path.basename(file_path) - dirname = os.path.dirname(file_path) digits = ''.join(filter(lambda c: c.isdigit(), basename)) - has_number = digits.isnumeric() - if not has_number: - JSONDataFilesValidator.__validate_comm_links(json_data) + all_jsons = [] + if not digits.isnumeric(): + # validate single file + all_jsons = [json_data] elif int(digits) == 0: - files = JSONDataFilesValidator.__get_files_for_validation(dirname, None, None) - all_data = [JSONDataFilesValidator.__get_json(file) for file in files] + # validate complete dataset + dirname = os.path.dirname(file_path) + files = self.__get_files_for_validation(dirname, None, None) + all_jsons = [get_json(file) for file in files] + else: + # only datasets starting with 0 + return - comm_ids = set() - task_ids = set() - for data in all_data: - for phase in data["phases"]: - comm_ids.update({int(comm["from"]["id"]) for comm in phase["communications"]} ) - comm_ids.update({int(comm["to"]["id"]) for comm in phase["communications"]}) - task_ids.update({int(task["entity"]["id"]) for task in phase["tasks"]}) - if not comm_ids.issubset(task_ids): - logging.error(f" Phase {phase["id"]}: tasks {comm_ids - task_ids} were referenced in communication, but were not found.") + if not self.validate_comm_links(all_jsons): + logging.error(f" Invalid dataset: {files}") @staticmethod - def __validate_comm_links(data): - for phase in data["phases"]: - comm_ids = {int(comm["from"]["id"]) for comm in phase["communications"]} - comm_ids.update({int(comm["to"]["id"]) for comm in phase["communications"]}) - task_ids = {int(task["entity"]["id"]) for task in phase["tasks"]} + def validate_comm_links(all_jsons): + for n in range(len(all_jsons[0]["phases"])): + comm_ids = set() + task_ids = set() + + for data in all_jsons: + comms = data["phases"][n]["communications"] + tasks = data["phases"][n]["tasks"] + comm_ids.update({int(comm["from"]["id"]) for comm in comms}) + comm_ids.update({int(comm["to"]["id"]) for comm in comms}) + task_ids.update({int(task["entity"]["id"]) for task in tasks}) if not comm_ids.issubset(task_ids): - logging.error(f" Phase {phase["id"]}: tasks {comm_ids - task_ids} were referenced in communication, but were not found.") + logging.error( + f" Phase {n}: Task ids: {comm_ids - task_ids}. Tasks are " + "referenced in communication, but are not present in the " + "dataset." 
+ ) + return False + return True def main(self): if self.__file_path is not None: if os.path.isfile(self.__file_path): - self.__validate_file(file_path=self.__file_path, - validate_comm_links=self.__validate_comm_links) + self.__validate_file(file_path=self.__file_path) else: sys.excepthook = exc_handler raise FileNotFoundError(f"File: {self.__file_path} NOT found") @@ -426,8 +434,7 @@ def main(self): file_prefix=self.__file_prefix, file_suffix=self.__file_suffix) for file in list_of_files_for_validation: - self.__validate_file(file_path=file, - validate_comm_links=self.__validate_comm_links) + self.__validate_file(file_path=file) else: sys.excepthook = exc_handler raise FileNotFoundError(f"Directory: {self.__dir_path} does NOT exist") From 941676f87728d90a7b6168ea139d9819e111a4ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Fri, 19 Jul 2024 15:47:21 +0200 Subject: [PATCH 08/12] #2300: scripts: always try to decompress --- scripts/JSON_data_files_validator.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/JSON_data_files_validator.py b/scripts/JSON_data_files_validator.py index a4ecb061d2..7b3a9ce58f 100644 --- a/scripts/JSON_data_files_validator.py +++ b/scripts/JSON_data_files_validator.py @@ -291,10 +291,13 @@ def validate(self, schema_to_validate: dict): def get_json(file_path): + """ Always try to decompress in case '.br' extension is missing. """ with open(file_path, "rb") as json_file: content = json_file.read() - if file_path.endswith('.br'): + try: content = brotli.decompress(content) + except Exception: + pass return json.loads(content.decode("utf-8")) class JSONDataFilesValidator: From 88ace283bb3f96419b04176bde1301498eb3f723 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Fri, 19 Jul 2024 16:39:55 +0200 Subject: [PATCH 09/12] #2300: scripts: fix dataset detection --- scripts/JSON_data_files_validator.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/scripts/JSON_data_files_validator.py b/scripts/JSON_data_files_validator.py index 7b3a9ce58f..0d3e9876f8 100644 --- a/scripts/JSON_data_files_validator.py +++ b/scripts/JSON_data_files_validator.py @@ -1,4 +1,5 @@ import os +import re import sys try: @@ -382,23 +383,30 @@ def __validate_file(self, file_path): "Passing by default when schema type not found.") if self.__validate_comm_links and schema_type == "LBDatafile": + # FIXME: extract into a method basename = os.path.basename(file_path) - digits = ''.join(filter(lambda c: c.isdigit(), basename)) + numbers = re.findall(r'\d+', basename) - all_jsons = [] - if not digits.isnumeric(): + if not numbers: # validate single file + files = [file_path] all_jsons = [json_data] - elif int(digits) == 0: + elif numbers[-1] == '0': # validate complete dataset dirname = os.path.dirname(file_path) - files = self.__get_files_for_validation(dirname, None, None) + index = basename.rfind('0') + base = basename[0:index] + #FIXME: files = get_complete_dataset... 
From b973f1e237d95faab4eea7306413b0c304e5232b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?=
Date: Fri, 19 Jul 2024 16:39:55 +0200
Subject: [PATCH 09/12] #2300: scripts: fix dataset detection

---
 scripts/JSON_data_files_validator.py | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/scripts/JSON_data_files_validator.py b/scripts/JSON_data_files_validator.py
index 7b3a9ce58f..0d3e9876f8 100644
--- a/scripts/JSON_data_files_validator.py
+++ b/scripts/JSON_data_files_validator.py
@@ -1,4 +1,5 @@
 import os
+import re
 import sys
 
 try:
@@ -383,21 +384,28 @@ def __validate_file(self, file_path):
                             "Passing by default when schema type not found.")
 
         if self.__validate_comm_links and schema_type == "LBDatafile":
+            # FIXME: extract into a method
             basename = os.path.basename(file_path)
-            digits = ''.join(filter(lambda c: c.isdigit(), basename))
+            numbers = re.findall(r'\d+', basename)
 
-            all_jsons = []
-            if not digits.isnumeric():
+            if not numbers:
                 # validate single file
+                files = [file_path]
                 all_jsons = [json_data]
-            elif int(digits) == 0:
+            elif numbers[-1] == '0':
                 # validate complete dataset
                 dirname = os.path.dirname(file_path)
-                files = self.__get_files_for_validation(dirname, None, None)
+                index = basename.rfind('0')
+                base = basename[0:index]
+                #FIXME: files = get_complete_dataset...
+                files = [os.path.join(dirname, f) for f in os.listdir(dirname)
+                         if f.startswith(base)]
+                print(files) #REMOVE_ME / logging
                 all_jsons = [get_json(file) for file in files]
             else:
-                # only datasets starting with 0
+                # this dataset is already validated
                 return
 
             if not self.validate_comm_links(all_jsons):
+                # FIXME: could be undefined
                 logging.error(f" Invalid dataset: {files}")
@@ -409,10 +417,13 @@ def validate_comm_links(all_jsons):
             comm_ids = set()
             task_ids = set()
 
             for data in all_jsons:
+                #FIXME: KeyError: 'communications'
+                #if data... get("communications") is not None:
                 comms = data["phases"][n]["communications"]
-                tasks = data["phases"][n]["tasks"]
                 comm_ids.update({int(comm["from"]["id"]) for comm in comms})
                 comm_ids.update({int(comm["to"]["id"]) for comm in comms})
+
+                tasks = data["phases"][n]["tasks"]
                 task_ids.update({int(task["entity"]["id"]) for task in tasks})
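
The KeyError FIXME above records a real hazard: a phase with no communication simply omits the key, so direct indexing raises; the next patch guards it with dict.get. A minimal demonstration of the failure mode and the guard:

    phase = {"tasks": [{"entity": {"id": 0}}]}  # no "communications" key at all

    try:
        comms = phase["communications"]          # raises KeyError
    except KeyError:
        comms = []

    if phase.get("communications") is not None:  # the guard the fix uses
        comms = phase["communications"]
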
From 88ace283bb3f96419b04176bde1301498eb3f723 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?=
Date: Tue, 23 Jul 2024 18:25:24 +0200
Subject: [PATCH 10/12] #2300: scripts: check if comm data exists

---
 scripts/JSON_data_files_validator.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/scripts/JSON_data_files_validator.py b/scripts/JSON_data_files_validator.py
index 0d3e9876f8..5af5f6f154 100644
--- a/scripts/JSON_data_files_validator.py
+++ b/scripts/JSON_data_files_validator.py
@@ -417,11 +417,10 @@ def validate_comm_links(all_jsons):
             task_ids = set()
 
             for data in all_jsons:
-                #FIXME: KeyError: 'communications'
-                #if data... get("communications") is not None:
-                comms = data["phases"][n]["communications"]
-                comm_ids.update({int(comm["from"]["id"]) for comm in comms})
-                comm_ids.update({int(comm["to"]["id"]) for comm in comms})
+                if data["phases"][n].get("communications") is not None:
+                    comms = data["phases"][n]["communications"]
+                    comm_ids.update({int(comm["from"]["id"]) for comm in comms})
+                    comm_ids.update({int(comm["to"]["id"]) for comm in comms})
 
                 tasks = data["phases"][n]["tasks"]
                 task_ids.update({int(task["entity"]["id"]) for task in tasks})
""" logging.info(f"Validating file: {file_path}") @@ -383,31 +410,20 @@ def __validate_file(self, file_path): "Passing by default when schema type not found.") if self.__validate_comm_links and schema_type == "LBDatafile": - # FIXME: extract into a method - basename = os.path.basename(file_path) - numbers = re.findall(r'\d+', basename) - - if not numbers: + num_nodes, current_node = self.get_nodes_info(file_path) + if num_nodes == '-1' and current_node == '-1': # validate single file - files = [file_path] all_jsons = [json_data] - elif numbers[-1] == '0': + elif current_node == '0': # validate complete dataset - dirname = os.path.dirname(file_path) - index = basename.rfind('0') - base = basename[0:index] - #FIXME: files = get_complete_dataset... - files = [os.path.join(dirname, f) for f in os.listdir(dirname) - if f.startswith(base) and (f.endswith(".json") or f.endswith(".json.br"))] - print(files) #REMOVE_ME / logging - all_jsons = [get_json(file) for file in files] + dataset_files = self.get_complete_dataset(file_path) + all_jsons = [get_json(file) for file in dataset_files] else: # this dataset is already validated return if not self.validate_comm_links(all_jsons): - # FIXME: could be undefined - logging.error(f" Invalid dataset: {files}") + logging.error(f" Invalid dataset for file: {file_path}!") @staticmethod diff --git a/scripts/check_lb_data_files.sh b/scripts/check_lb_data_files.sh index 656dcb2bba..b88575b4d9 100755 --- a/scripts/check_lb_data_files.sh +++ b/scripts/check_lb_data_files.sh @@ -8,11 +8,8 @@ cd "$path_to_vt_build_dir" || exit 1 function run_schema_validator() { file=$1 - if python3 "${path_to_vt_src_dir}/scripts/JSON_data_files_validator.py" --file_path="$file" --validate_comm_links + if ! python3 "${path_to_vt_src_dir}/scripts/JSON_data_files_validator.py" --file_path="$file" --validate_comm_links then - echo "Valid JSON schema in $file" - else - >&2 echo "Invalid schema in $file.. exiting" exit 1; fi }