Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Validate config changes #552

Open
wants to merge 3 commits into
base: issue-511
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 49 additions & 38 deletions isatools/isajson/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -693,11 +693,12 @@ def check_measurement_technology_types(assay_json, configs):
)


def check_study_and_assay_graphs(study_json, configs):
def check_assay_graph(process_sequence_json, config):
def check_study_and_assay_graphs(study_json, configs, no_config):
def check_assay_graph(process_sequence_json, config, no_config):
list_of_last_processes_in_sequence = [i for i in process_sequence_json if "nextProcess" not in i.keys()]
log.info("Checking against assay protocol sequence configuration {}".format(config["description"]))
config_protocol_sequence = [i["protocol"] for i in config["protocols"]]
if not no_config:
log.info("Checking against assay protocol sequence configuration {}".format(config["description"]))
config_protocol_sequence = [i["protocol"] for i in config["protocols"]]
for process in list_of_last_processes_in_sequence: # build graphs backwards
assay_graph = list()
try:
Expand Down Expand Up @@ -727,40 +728,48 @@ def check_assay_graph(process_sequence_json, config):
break
except KeyError: # this happens when we can't find a previousProcess
pass
assay_graph.reverse()
assay_protocol_sequence = [[j for j in i if not j.startswith("#")] for i in assay_graph]
assay_protocol_sequence = [i for j in assay_protocol_sequence for i in j] # flatten list
assay_protocol_sequence_of_interest = [i for i in assay_protocol_sequence if i in config_protocol_sequence]
# filter out protocols in the sequence that are not of interest (i.e. those additional to the ones required by the config)
squished_assay_protocol_sequence_of_interest = list()
prev_prot = None
for prot in assay_protocol_sequence_of_interest: # remove consecutive same protocols
if prev_prot != prot:
squished_assay_protocol_sequence_of_interest.append(prot)
prev_prot = prot
from isatools.utils import contains
if not contains(squished_assay_protocol_sequence_of_interest, config_protocol_sequence):
warnings.append({
"message": "Process sequence is not valid against configuration",
"supplemental": "Config protocol sequence {} does not in assay protocol sequence {}".format(
config_protocol_sequence,
squished_assay_protocol_sequence_of_interest),
"code": 4004
})
log.warning("Configuration protocol sequence {} does not match study graph found in {}"
.format(config_protocol_sequence, assay_protocol_sequence))

if not no_config:
assay_graph.reverse()
assay_protocol_sequence = [[j for j in i if not j.startswith("#")] for i in assay_graph]
assay_protocol_sequence = [i for j in assay_protocol_sequence for i in j] # flatten list
assay_protocol_sequence_of_interest = [i for i in assay_protocol_sequence if i in config_protocol_sequence]
# filter out protocols in the sequence that are not of interest (i.e. those additional to the ones required by the config)
squished_assay_protocol_sequence_of_interest = list()
prev_prot = None
for prot in assay_protocol_sequence_of_interest: # remove consecutive same protocols
if prev_prot != prot:
squished_assay_protocol_sequence_of_interest.append(prot)
prev_prot = prot
from isatools.utils import contains
if not contains(squished_assay_protocol_sequence_of_interest, config_protocol_sequence):
warnings.append({
"message": "Process sequence is not valid against configuration",
"supplemental": "Config protocol sequence {} does not in assay protocol sequence {}".format(
config_protocol_sequence,
squished_assay_protocol_sequence_of_interest),
"code": 4004
})
log.warning("Configuration protocol sequence {} does not match study graph found in {}"
.format(config_protocol_sequence, assay_protocol_sequence))

protocols_and_types = dict([(i["@id"], i["protocolType"]["annotationValue"]) for i in study_json["protocols"]])
# first check study graph
log.info("Loading configuration (study)")
config = configs["study"]
check_assay_graph(study_json["processSequence"], config)
if not no_config:
log.info("Loading configuration (study)")
config = configs["study"]
else:
config = {}
check_assay_graph(study_json["processSequence"], config, no_config)
for assay_json in study_json["assays"]:
m = assay_json["measurementType"]["annotationValue"]
t = assay_json["technologyType"]["annotationValue"]
log.info("Loading configuration ({}, {})".format(m, t))
config = configs[(m, t)]
check_assay_graph(assay_json["processSequence"], config)
if not no_config:
log.info("Loading configuration ({}, {})".format(m, t))
config = configs[(m, t)]
else:
config = {}
check_assay_graph(assay_json["processSequence"], config, no_config)


def check_study_groups(study_or_assay):
Expand Down Expand Up @@ -811,7 +820,8 @@ def validate(
fp,
config_dir=default_config_dir,
log_level=None,
base_schemas_dir="isa_model_version_1_0_schemas"
base_schemas_dir="isa_model_version_1_0_schemas",
no_config: bool = False
):
if config_dir is None:
config_dir = default_config_dir
Expand Down Expand Up @@ -887,10 +897,11 @@ def validate(
check_term_accession_used_no_source_ref(isa_json) # Rule 3010
log.info("Loading configurations from " + config_dir)
configs = load_config(config_dir) # Rule 4001
log.info("Checking measurement and technology types...")
for study_json in isa_json["studies"]:
for assay_json in study_json["assays"]:
check_measurement_technology_types(assay_json, configs) # Rule 4002
if not no_config:
log.info("Checking measurement and technology types...")
for study_json in isa_json["studies"]:
for assay_json in study_json["assays"]:
check_measurement_technology_types(assay_json, configs) # Rule 4002
log.info("Checking against configuration schemas...")
check_isa_schemas(
isa_json=isa_json,
Expand All @@ -907,7 +918,7 @@ def validate(
fp.seek(0) # reset file pointer
log.info("Checking study and assay graphs...")
for study_json in isa_json["studies"]:
check_study_and_assay_graphs(study_json, configs) # Rule 4004
check_study_and_assay_graphs(study_json, configs, no_config) # Rule 4004
fp.seek(0)
# try load and do study groups check
log.info("Checking study groups...")
Expand Down
5 changes: 4 additions & 1 deletion isatools/isatab/validate/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,14 +169,16 @@ def validate(fp: TextIO,
config_dir: str = default_config_dir,
origin: str or None = None,
rules: dict = None,
log_level=None) -> dict:
log_level=None,
no_config: bool = False) -> dict:
"""
A function to validate an ISA investigation tab file
:param fp: the investigation file handler
:param config_dir: the XML configuration directory
:param origin: value accepted = mzml2isa or None
:param rules: optional rules to run (default: all rules)
:param log_level: optional log level (default: INFO)
:param no_config: if True, do not validate against the configs (default: False)
:return: a dictionary of the validation results (errors, warnings and info)
"""
if not log_level:
Expand All @@ -191,6 +193,7 @@ def validate(fp: TextIO,
"investigation_df_dict": i_df_dict,
"dir_context": path.dirname(fp.name),
"configs": config_dir,
"no_config": no_config
}
investigation_validator = ISAInvestigationValidator(**params, **built_rules['investigation'])

Expand Down
21 changes: 15 additions & 6 deletions isatools/isatab/validate/rules/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from pandas import DataFrame

from isatools.io import isatab_configurator
from isatools.utils import utf8_text_file_open
from isatools.isatab.defaults import NUMBER_OF_STUDY_GROUPS
from isatools.isatab.load import load_table
Expand Down Expand Up @@ -112,22 +113,25 @@ def __init__(self,
dir_context: str,
configs: str,
available_rules: list = INVESTIGATION_RULES_MAPPING,
rules_to_run: tuple = DEFAULT_INVESTIGATION_RULES):
rules_to_run: tuple = DEFAULT_INVESTIGATION_RULES,
no_config: bool = False):
""" The ISA investigation validator class

:param investigation_df_dict: a dictionary of DataFrames and lists of DataFrames representing the investigation file
:param dir_context: the directory of the investigation
:param configs: directory of the XML config files
:param available_rules: a customizable list of all available rules for investigation objects
:param rules_to_run: a customizable tuple of rules identifiers to run for investigation objects
:param no_config: if True, do not validate against the configs (default: False)
"""
self.all_rules = Rules(rules_to_run=rules_to_run, available_rules=available_rules)
self.has_validated = False
self.params = {
'investigation_df_dict': investigation_df_dict,
'dir_context': dir_context,
'configs': configs,
'term_source_refs': None
'term_source_refs': None,
"no_config": no_config
}
self.all_rules.validate_rules(validator=self)

Expand All @@ -140,7 +144,8 @@ def __init__(self,
study_filename: str,
study_df: DataFrame,
available_rules: List = STUDY_RULES_MAPPING,
rules_to_run: tuple = DEFAULT_STUDY_RULES):
rules_to_run: tuple = DEFAULT_STUDY_RULES,
no_config: bool = False):
"""
The ISA study validator class
:param validator: the investigation validator
Expand All @@ -149,13 +154,15 @@ def __init__(self,
:param study_df: the study dataframe
:param available_rules: a customizable list of all available rules for investigation objects
:param rules_to_run: a customizable tuple of rules identifiers to run for investigation objects
:param no_config: if True, do not validate against the configs (default: False)
"""
self.all_rules = Rules(rules_to_run=rules_to_run, available_rules=available_rules)
self.has_validated = False
self.params = {
**validator.params,
'study_df': study_df,
'config': validator.params['configs'][('[sample]', '')],
'config': validator.params['configs'][('[sample]', '')] if ('[sample]', '') in validator.params['configs']
else isatab_configurator.IsaTabConfigFileType(),
'study_filename': study_filename
}
with utf8_text_file_open(path.join(self.params['dir_context'], study_filename)) as s_fp:
Expand Down Expand Up @@ -183,7 +190,8 @@ def __init__(self,
assay_filename: str = None,
assay_df: DataFrame = None,
available_rules: List = ASSAY_RULES_MAPPING,
rules_to_run: tuple = DEFAULT_ASSAY_RULES):
rules_to_run: tuple = DEFAULT_ASSAY_RULES,
no_config: bool = False):
"""
The ISA assay validator class
:param assay_tables: list of assay tables
Expand All @@ -193,6 +201,7 @@ def __init__(self,
:param assay_df: the assay dataframe
:param available_rules: a customizable list of all available rules for investigation objects
:param rules_to_run: a customizable tuple of rules identifiers to run for investigation objects
:param no_config: if True, do not validate against the configs (default: False)
"""
self.all_rules = Rules(rules_to_run=rules_to_run, available_rules=available_rules)
self.has_validated = False
Expand All @@ -207,7 +216,7 @@ def __init__(self,
if assay_filename != '':
lowered_mt = assay_df['Study Assay Measurement Type'].tolist()[assay_index].lower()
lowered_tt = assay_df['Study Assay Technology Type'].tolist()[assay_index].lower()
self.params['config'] = self.params['configs'].get((lowered_mt, lowered_tt), None)
self.params['config'] = self.params['configs'].get((lowered_mt, lowered_tt), isatab_configurator.IsaTabConfigFileType())
if self.params['config']:
with utf8_text_file_open(path.join(self.params['dir_context'], assay_filename)) as a_fp:
self.params['assay_table'] = load_table(a_fp)
Expand Down
34 changes: 17 additions & 17 deletions isatools/isatab/validate/rules/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@
{'rule': check_pubmed_ids_format, 'params': ['investigation_df_dict'], 'identifier': '3003'},
{'rule': check_ontology_sources, 'params': ['investigation_df_dict'], 'identifier': '3008'},

{'rule': load_config, 'params': ['configs'], 'identifier': '4001'},
{'rule': check_measurement_technology_types, 'params': ['investigation_df_dict', 'configs'], 'identifier': '4002'},
{'rule': check_investigation_against_config, 'params': ['investigation_df_dict', 'configs'], 'identifier': '4003'},
{'rule': load_config, 'params': ['configs', 'no_config'], 'identifier': '4001'},
{'rule': check_measurement_technology_types, 'params': ['investigation_df_dict', 'configs', 'no_config'], 'identifier': '4002'},
{'rule': check_investigation_against_config, 'params': ['investigation_df_dict', 'configs', 'no_config'], 'identifier': '4003'},

# copies
{'rule': check_table_files_read, 'params': ['investigation_df_dict', 'dir_context'], 'identifier': '0008'},
Expand All @@ -58,22 +58,22 @@

STUDY_RULES_MAPPING = [

{'rule': check_unit_field, 'params': ['study_sample_table', 'config'], 'identifier': '1099'},
{'rule': check_unit_field, 'params': ['study_sample_table', 'config', 'no_config'], 'identifier': '1099'},

{
'rule': check_ontology_fields,
'params': ['study_sample_table', 'config', 'term_source_refs'],
'params': ['study_sample_table', 'config', 'term_source_refs', 'no_config'],
'identifier': '3010'
},

{'rule': check_required_fields, 'params': ['study_sample_table', 'config'], 'identifier': '4003'},
{'rule': check_required_fields, 'params': ['study_sample_table', 'config', 'no_config'], 'identifier': '4003'},
{'rule': check_factor_value_presence, 'params': ['study_sample_table'], 'identifier': '4007'},
{
'rule': check_protocol_fields,
'params': ['study_sample_table', 'config', 'protocol_names_and_types'],
'params': ['study_sample_table', 'config', 'protocol_names_and_types', 'no_config'],
'identifier': '4009'
},
{'rule': check_field_values, 'params': ['study_sample_table', 'config'], 'identifier': '4011'},
{'rule': check_field_values, 'params': ['study_sample_table', 'config', 'no_config'], 'identifier': '4011'},
{'rule': load_table_checks, 'params': ['study_sample_table', 'study_filename'], 'identifier': '4014'},

{
Expand All @@ -83,30 +83,30 @@
},

# copies
{'rule': check_required_fields, 'params': ['study_sample_table', 'config'], 'identifier': '4008'},
{'rule': check_required_fields, 'params': ['study_sample_table', 'config'], 'identifier': '4010'},
{'rule': check_required_fields, 'params': ['study_sample_table', 'config', 'no_config'], 'identifier': '4008'},
{'rule': check_required_fields, 'params': ['study_sample_table', 'config', 'no_config'], 'identifier': '4010'},
]

ASSAY_RULES_MAPPING = [
{'rule': check_sample_names, 'params': ['study_sample_table', 'assay_tables'], 'identifier': '0000'},

{'rule': check_unit_field, 'params': ['assay_table', 'config'], 'identifier': '1099'},
{'rule': check_unit_field, 'params': ['assay_table', 'config', 'no_config'], 'identifier': '1099'},

{'rule': check_ontology_fields, 'params': ['assay_table', 'config', 'term_source_refs'], 'identifier': '3010'},
{'rule': check_ontology_fields, 'params': ['assay_table', 'config', 'term_source_refs', 'no_config'], 'identifier': '3010'},

{'rule': check_required_fields, 'params': ['assay_table', 'config'], 'identifier': '4003'},
{'rule': check_required_fields, 'params': ['assay_table', 'config', 'no_config'], 'identifier': '4003'},
{'rule': check_factor_value_presence, 'params': ['assay_table'], 'identifier': '4007'},
{
'rule': check_protocol_fields,
'params': ['assay_table', 'config', 'protocol_names_and_types'],
'params': ['assay_table', 'config', 'protocol_names_and_types', 'no_config'],
'identifier': '4009'
},
{'rule': check_field_values, 'params': ['assay_table', 'config'], 'identifier': '4011'},
{'rule': check_field_values, 'params': ['assay_table', 'config', 'no_config'], 'identifier': '4011'},
{'rule': load_table_checks, 'params': ['assay_table', 'assay_filename'], 'identifier': '4014'},

# copies
{'rule': check_required_fields, 'params': ['study_sample_table', 'config'], 'identifier': '4008'},
{'rule': check_required_fields, 'params': ['study_sample_table', 'config'], 'identifier': '4010'},
{'rule': check_required_fields, 'params': ['study_sample_table', 'config', 'no_config'], 'identifier': '4008'},
{'rule': check_required_fields, 'params': ['study_sample_table', 'config', 'no_config'], 'identifier': '4010'},

{
'rule': check_study_groups,
Expand Down
Loading
Loading