Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add more robust hedID validation, minor ontology code tweaks #922

Merged
merged 1 commit into from
May 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 40 additions & 12 deletions hed/schema/schema_io/ontology_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,10 @@ def _get_hedid_range(schema_name, df_key):
starting_id, ending_id = library_index_ranges[schema_name]

start_object_range, end_object_range = object_type_id_offset[df_key]
initial_tag_adj = 1 # We always skip 1
if df_key == constants.TAG_KEY:
initial_tag_adj = 1 # We always skip 1 for tags
else:
initial_tag_adj = 0
final_start = starting_id + start_object_range + initial_tag_adj
final_end = starting_id + end_object_range
if end_object_range == -1:
Expand Down Expand Up @@ -108,19 +111,22 @@ def update_dataframes_from_schema(dataframes, schema, schema_name="", get_as_ids
dataframes(dict of str:pd.DataFrames): The updated dataframes
These dataframes can potentially have extra columns
"""
hedid_errors = []
# 1. Verify existing hed ids don't conflict between schema/dataframes
for key, df in dataframes.items():
section_key = constants.section_mapping.get(key)
for df_key, df in dataframes.items():
section_key = constants.section_mapping.get(df_key)
if not section_key:
continue
section = schema[section_key]

hedid_errors = _verify_hedid_matches(section, df)
if hedid_errors:
raise HedFileError(hedid_errors[0]['code'],
f"{len(hedid_errors)} issues found with hedId mismatches. See the .issues "
f"parameter on this exception for more details.", schema.name,
issues=hedid_errors)
unused_tag_ids = _get_hedid_range(schema_name, df_key)
hedid_errors += _verify_hedid_matches(section, df, unused_tag_ids)

if hedid_errors:
raise HedFileError(hedid_errors[0]['code'],
f"{len(hedid_errors)} issues found with hedId mismatches. See the .issues "
f"parameter on this exception for more details.", schema.name,
issues=hedid_errors)

# 2. Get the new schema as DFs
from hed.schema.schema_io.schema2df import Schema2DF # Late import as this is recursive
Expand All @@ -144,12 +150,13 @@ def update_dataframes_from_schema(dataframes, schema, schema_name="", get_as_ids
return output_dfs


def _verify_hedid_matches(section, df):
def _verify_hedid_matches(section, df, unused_tag_ids):
""" Verify ID's in both have the same label, and verify all entries in the dataframe are already in the schema

Parameters:
section(HedSchemaSection): The loaded schema section to compare ID's with
df(pd.DataFrame): The loaded spreadsheet dataframe to compare with
unused_tag_ids(set): The valid range of ID's for this df

Returns:
error_list(list of str): A list of errors found matching id's
Expand All @@ -168,6 +175,23 @@ def _verify_hedid_matches(section, df):
f"'{label}' does not exist in the schema file provided, only the spreadsheet.")
continue
entry_id = entry.attributes.get(HedKey.HedID)
if df_id:
if not (df_id.startswith("HED_") and len(df_id) == len("HED_0000000")):
hedid_errors += schema_util.format_error(row_number, row,
f"'{label}' has an improperly formatted hedID in the dataframe.")
continue
id_value = remove_prefix(df_id, "HED_")
try:
id_int = int(id_value)
if id_int not in unused_tag_ids:
hedid_errors += schema_util.format_error(row_number, row,
f"'{label}' has id {id_int} which is outside of the valid range for this type. Valid range is: {min(unused_tag_ids)} to {max(unused_tag_ids)}")
continue
except ValueError:
hedid_errors += schema_util.format_error(row_number, row,
f"'{label}' has a non-numeric hedID in the dataframe.")
continue

if entry_id and entry_id != df_id:
hedid_errors += schema_util.format_error(row_number, row,
f"'{label}' has hedID '{df_id}' in dataframe, but '{entry_id}' in schema.")
Expand Down Expand Up @@ -232,7 +256,9 @@ def convert_df_to_omn(dataframes):
dataframes(dict): A set of dataframes representing a schema, potentially including extra columns

Returns:
omn_file(str): A combined string representing (most of) a schema omn file.
tuple:
omn_file(str): A combined string representing (most of) a schema omn file.
omn_data(dict): a dict of DF_SUFFIXES:str, representing each .tsv file in omn format.
"""
from hed.schema.hed_schema_io import from_dataframes
# Load the schema, so we can save it out with ID's
Expand All @@ -243,13 +269,15 @@ def convert_df_to_omn(dataframes):
# Write out the new dataframes in omn format
annotation_props = _get_annotation_prop_ids(dataframes)
full_text = ""
omn_data = {}
for suffix, dataframe in dataframes.items():
if suffix == constants.STRUCT_KEY: # not handled here yet
continue
output_text = _convert_df_to_omn(dataframes[suffix], annotation_properties=annotation_props)
omn_data[suffix] = output_text
full_text += output_text + "\n"

return full_text
return full_text, omn_data


def _convert_df_to_omn(df, annotation_properties=("",)):
Expand Down
30 changes: 22 additions & 8 deletions tests/schema/test_ontology_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def test_get_library_name_and_id_unknown(self):
def test_get_hedid_range_normal_case(self):
id_set = ontology_util._get_hedid_range("score", constants.DATA_KEY)
self.assertTrue(40401 in id_set)
self.assertEqual(len(id_set), 200 - 1) # Check the range size
self.assertEqual(len(id_set), 200) # Check the range size

def test_get_hedid_range_boundary(self):
# Test boundary condition where end range is -1
Expand All @@ -55,27 +55,41 @@ def setUp(self):
self.schema_id = load_schema_version("8.3.0")

def test_no_hedid(self):
df = pd.DataFrame([{'rdfs:label': 'Event', 'hedId': '001'}, {'rdfs:label': 'Age-#', 'hedId': '002'}])
errors = _verify_hedid_matches(self.schema_82.tags, df)
df = pd.DataFrame([{'rdfs:label': 'Event', 'hedId': ''}, {'rdfs:label': 'Age-#', 'hedId': ''}])
errors = _verify_hedid_matches(self.schema_82.tags, df, ontology_util._get_hedid_range("", constants.TAG_KEY))
self.assertEqual(len(errors), 0)

def test_id_matches(self):
df = pd.DataFrame(
[{'rdfs:label': 'Event', 'hedId': 'HED_0012001'}, {'rdfs:label': 'Age-#', 'hedId': 'HED_0012475'}])
errors = _verify_hedid_matches(self.schema_id.tags, df)
errors = _verify_hedid_matches(self.schema_id.tags, df, ontology_util._get_hedid_range("", constants.TAG_KEY))
self.assertEqual(len(errors), 0)

def test_label_mismatch_id(self):
df = pd.DataFrame(
[{'rdfs:label': 'Event', 'hedId': 'invalid_id'}, {'rdfs:label': 'Age-#', 'hedId': 'invalid_id'}])
[{'rdfs:label': 'Event', 'hedId': 'HED_0012005'}, {'rdfs:label': 'Age-#', 'hedId': 'HED_0012007'}])

errors = _verify_hedid_matches(self.schema_id.tags, df)
errors = _verify_hedid_matches(self.schema_id.tags, df, ontology_util._get_hedid_range("", constants.TAG_KEY))
self.assertEqual(len(errors), 2)

def test_label_no_entry(self):
df = pd.DataFrame([{'rdfs:label': 'NotARealEvent', 'hedId': 'does_not_matter'}])

errors = _verify_hedid_matches(self.schema_id.tags, df)
errors = _verify_hedid_matches(self.schema_id.tags, df, ontology_util._get_hedid_range("", constants.TAG_KEY))
self.assertEqual(len(errors), 1)

def test_out_of_range(self):
df = pd.DataFrame([{'rdfs:label': 'Event', 'hedId': 'HED_0000000'}])

errors = _verify_hedid_matches(self.schema_82.tags, df,
ontology_util._get_hedid_range("", constants.TAG_KEY))
self.assertEqual(len(errors), 1)

def test_not_int(self):
df = pd.DataFrame([{'rdfs:label': 'Event', 'hedId': 'HED_AAAAAAA'}])

errors = _verify_hedid_matches(self.schema_82.tags, df,
ontology_util._get_hedid_range("", constants.TAG_KEY))
self.assertEqual(len(errors), 1)

def test_get_all_ids_exists(self):
Expand Down Expand Up @@ -143,7 +157,7 @@ def test_update_dataframes_from_schema(self):
class TestConvertOmn(unittest.TestCase):
def test_convert_df_to_omn(self):
dataframes = load_schema_version("8.3.0").get_as_dataframes()
omn_version = convert_df_to_omn(dataframes)
omn_version, _ = convert_df_to_omn(dataframes)

# make these more robust, for now just verify it's somewhere in the result
for df_name, df in dataframes.items():
Expand Down
Loading