Skip to content

Commit

Permalink
Merge pull request #922 from IanCa/develop
Browse files Browse the repository at this point in the history
Add more robust hedID validation, minor ontology code tweaks
  • Loading branch information
VisLab committed May 9, 2024
2 parents 6bb4d57 + ce87684 commit f2e340a
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 20 deletions.
52 changes: 40 additions & 12 deletions hed/schema/schema_io/ontology_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,10 @@ def _get_hedid_range(schema_name, df_key):
starting_id, ending_id = library_index_ranges[schema_name]

start_object_range, end_object_range = object_type_id_offset[df_key]
initial_tag_adj = 1 # We always skip 1
if df_key == constants.TAG_KEY:
initial_tag_adj = 1 # We always skip 1 for tags
else:
initial_tag_adj = 0
final_start = starting_id + start_object_range + initial_tag_adj
final_end = starting_id + end_object_range
if end_object_range == -1:
Expand Down Expand Up @@ -108,19 +111,22 @@ def update_dataframes_from_schema(dataframes, schema, schema_name="", get_as_ids
dataframes(dict of str:pd.DataFrames): The updated dataframes
These dataframes can potentially have extra columns
"""
hedid_errors = []
# 1. Verify existing hed ids don't conflict between schema/dataframes
for key, df in dataframes.items():
section_key = constants.section_mapping.get(key)
for df_key, df in dataframes.items():
section_key = constants.section_mapping.get(df_key)
if not section_key:
continue
section = schema[section_key]

hedid_errors = _verify_hedid_matches(section, df)
if hedid_errors:
raise HedFileError(hedid_errors[0]['code'],
f"{len(hedid_errors)} issues found with hedId mismatches. See the .issues "
f"parameter on this exception for more details.", schema.name,
issues=hedid_errors)
unused_tag_ids = _get_hedid_range(schema_name, df_key)
hedid_errors += _verify_hedid_matches(section, df, unused_tag_ids)

if hedid_errors:
raise HedFileError(hedid_errors[0]['code'],
f"{len(hedid_errors)} issues found with hedId mismatches. See the .issues "
f"parameter on this exception for more details.", schema.name,
issues=hedid_errors)

# 2. Get the new schema as DFs
from hed.schema.schema_io.schema2df import Schema2DF # Late import as this is recursive
Expand All @@ -144,12 +150,13 @@ def update_dataframes_from_schema(dataframes, schema, schema_name="", get_as_ids
return output_dfs


def _verify_hedid_matches(section, df):
def _verify_hedid_matches(section, df, unused_tag_ids):
""" Verify ID's in both have the same label, and verify all entries in the dataframe are already in the schema
Parameters:
section(HedSchemaSection): The loaded schema section to compare ID's with
df(pd.DataFrame): The loaded spreadsheet dataframe to compare with
unused_tag_ids(set): The valid range of ID's for this df
Returns:
error_list(list of str): A list of errors found matching id's
Expand All @@ -168,6 +175,23 @@ def _verify_hedid_matches(section, df):
f"'{label}' does not exist in the schema file provided, only the spreadsheet.")
continue
entry_id = entry.attributes.get(HedKey.HedID)
if df_id:
if not (df_id.startswith("HED_") and len(df_id) == len("HED_0000000")):
hedid_errors += schema_util.format_error(row_number, row,
f"'{label}' has an improperly formatted hedID in the dataframe.")
continue
id_value = remove_prefix(df_id, "HED_")
try:
id_int = int(id_value)
if id_int not in unused_tag_ids:
hedid_errors += schema_util.format_error(row_number, row,
f"'{label}' has id {id_int} which is outside of the valid range for this type. Valid range is: {min(unused_tag_ids)} to {max(unused_tag_ids)}")
continue
except ValueError:
hedid_errors += schema_util.format_error(row_number, row,
f"'{label}' has a non-numeric hedID in the dataframe.")
continue

if entry_id and entry_id != df_id:
hedid_errors += schema_util.format_error(row_number, row,
f"'{label}' has hedID '{df_id}' in dataframe, but '{entry_id}' in schema.")
Expand Down Expand Up @@ -232,7 +256,9 @@ def convert_df_to_omn(dataframes):
dataframes(dict): A set of dataframes representing a schema, potentially including extra columns
Returns:
omn_file(str): A combined string representing (most of) a schema omn file.
tuple:
omn_file(str): A combined string representing (most of) a schema omn file.
omn_data(dict): a dict of DF_SUFFIXES:str, representing each .tsv file in omn format.
"""
from hed.schema.hed_schema_io import from_dataframes
# Load the schema, so we can save it out with ID's
Expand All @@ -243,13 +269,15 @@ def convert_df_to_omn(dataframes):
# Write out the new dataframes in omn format
annotation_props = _get_annotation_prop_ids(dataframes)
full_text = ""
omn_data = {}
for suffix, dataframe in dataframes.items():
if suffix == constants.STRUCT_KEY: # not handled here yet
continue
output_text = _convert_df_to_omn(dataframes[suffix], annotation_properties=annotation_props)
omn_data[suffix] = output_text
full_text += output_text + "\n"

return full_text
return full_text, omn_data


def _convert_df_to_omn(df, annotation_properties=("",)):
Expand Down
30 changes: 22 additions & 8 deletions tests/schema/test_ontology_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def test_get_library_name_and_id_unknown(self):
def test_get_hedid_range_normal_case(self):
id_set = ontology_util._get_hedid_range("score", constants.DATA_KEY)
self.assertTrue(40401 in id_set)
self.assertEqual(len(id_set), 200 - 1) # Check the range size
self.assertEqual(len(id_set), 200) # Check the range size

def test_get_hedid_range_boundary(self):
# Test boundary condition where end range is -1
Expand All @@ -55,27 +55,41 @@ def setUp(self):
self.schema_id = load_schema_version("8.3.0")

def test_no_hedid(self):
df = pd.DataFrame([{'rdfs:label': 'Event', 'hedId': '001'}, {'rdfs:label': 'Age-#', 'hedId': '002'}])
errors = _verify_hedid_matches(self.schema_82.tags, df)
df = pd.DataFrame([{'rdfs:label': 'Event', 'hedId': ''}, {'rdfs:label': 'Age-#', 'hedId': ''}])
errors = _verify_hedid_matches(self.schema_82.tags, df, ontology_util._get_hedid_range("", constants.TAG_KEY))
self.assertEqual(len(errors), 0)

def test_id_matches(self):
df = pd.DataFrame(
[{'rdfs:label': 'Event', 'hedId': 'HED_0012001'}, {'rdfs:label': 'Age-#', 'hedId': 'HED_0012475'}])
errors = _verify_hedid_matches(self.schema_id.tags, df)
errors = _verify_hedid_matches(self.schema_id.tags, df, ontology_util._get_hedid_range("", constants.TAG_KEY))
self.assertEqual(len(errors), 0)

def test_label_mismatch_id(self):
df = pd.DataFrame(
[{'rdfs:label': 'Event', 'hedId': 'invalid_id'}, {'rdfs:label': 'Age-#', 'hedId': 'invalid_id'}])
[{'rdfs:label': 'Event', 'hedId': 'HED_0012005'}, {'rdfs:label': 'Age-#', 'hedId': 'HED_0012007'}])

errors = _verify_hedid_matches(self.schema_id.tags, df)
errors = _verify_hedid_matches(self.schema_id.tags, df, ontology_util._get_hedid_range("", constants.TAG_KEY))
self.assertEqual(len(errors), 2)

def test_label_no_entry(self):
df = pd.DataFrame([{'rdfs:label': 'NotARealEvent', 'hedId': 'does_not_matter'}])

errors = _verify_hedid_matches(self.schema_id.tags, df)
errors = _verify_hedid_matches(self.schema_id.tags, df, ontology_util._get_hedid_range("", constants.TAG_KEY))
self.assertEqual(len(errors), 1)

def test_out_of_range(self):
df = pd.DataFrame([{'rdfs:label': 'Event', 'hedId': 'HED_0000000'}])

errors = _verify_hedid_matches(self.schema_82.tags, df,
ontology_util._get_hedid_range("", constants.TAG_KEY))
self.assertEqual(len(errors), 1)

def test_not_int(self):
df = pd.DataFrame([{'rdfs:label': 'Event', 'hedId': 'HED_AAAAAAA'}])

errors = _verify_hedid_matches(self.schema_82.tags, df,
ontology_util._get_hedid_range("", constants.TAG_KEY))
self.assertEqual(len(errors), 1)

def test_get_all_ids_exists(self):
Expand Down Expand Up @@ -143,7 +157,7 @@ def test_update_dataframes_from_schema(self):
class TestConvertOmn(unittest.TestCase):
def test_convert_df_to_omn(self):
dataframes = load_schema_version("8.3.0").get_as_dataframes()
omn_version = convert_df_to_omn(dataframes)
omn_version, _ = convert_df_to_omn(dataframes)

# make these more robust, for now just verify it's somewhere in the result
for df_name, df in dataframes.items():
Expand Down

0 comments on commit f2e340a

Please sign in to comment.