Skip to content

Commit

Permalink
Merge pull request #660 from IanCa/dev3
Browse files Browse the repository at this point in the history
Add sidecar brace support
  • Loading branch information
VisLab authored May 9, 2023
2 parents 35f4b6e + 4fa1ff0 commit 4fd9541
Show file tree
Hide file tree
Showing 29 changed files with 709 additions and 511 deletions.
3 changes: 2 additions & 1 deletion hed/errors/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .error_reporter import ErrorHandler, get_printable_issue_string, sort_issues
from .error_types import DefinitionErrors, OnsetErrors, SchemaErrors, SchemaWarnings, SidecarErrors, ValidationErrors
from .error_types import DefinitionErrors, OnsetErrors, SchemaErrors, SchemaWarnings, SidecarErrors, \
ValidationErrors, ColumnErrors
from .error_types import ErrorContext, ErrorSeverity
from .exceptions import HedExceptions, HedFileError
12 changes: 6 additions & 6 deletions hed/errors/error_messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,23 +401,23 @@ def onset_wrong_placeholder(tag, has_placeholder):
return f"Onset/offset def tag {tag} should not have a placeholder, but has one."


@hed_error(ColumnErrors.INVALID_COLUMN_REF)
def invalid_column_ref(bad_refs):
return f"Bad column references found(columns do not exist): {bad_refs}"
@hed_error(ColumnErrors.INVALID_COLUMN_REF, actual_code=SidecarErrors.SIDECAR_BRACES_INVALID)
def invalid_column_ref(bad_ref):
    """ Build the message for a curly brace reference to a column that is not known.

    Parameters:
        bad_ref (str): The referenced column name that could not be found.

    Returns:
        str: The error message.
    """
    # The previous message ended with an unbalanced quote ("... is unknown.'").
    return f"The column '{bad_ref}' is unknown."


@hed_error(ColumnErrors.SELF_COLUMN_REF)
@hed_error(ColumnErrors.SELF_COLUMN_REF, actual_code=SidecarErrors.SIDECAR_BRACES_INVALID)
def self_column_ref(self_ref):
return f"Column references itself: {self_ref}"


@hed_error(ColumnErrors.NESTED_COLUMN_REF)
@hed_error(ColumnErrors.NESTED_COLUMN_REF, actual_code=SidecarErrors.SIDECAR_BRACES_INVALID)
def nested_column_ref(column_name, ref_column):
return f"Column {column_name} has a nested reference to {ref_column}. " \
f"Column reference columns cannot contain other column references."


@hed_error(ColumnErrors.MALFORMED_COLUMN_REF)
@hed_error(ColumnErrors.MALFORMED_COLUMN_REF, actual_code=SidecarErrors.SIDECAR_BRACES_INVALID)
def nested_column_ref(column_name, index, symbol):
return f"Column {column_name} has a malformed column reference. Improper symbol {symbol} found at index {index}."

Expand Down
3 changes: 1 addition & 2 deletions hed/errors/error_reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,10 +396,9 @@ def val_error_unknown(*args, **kwargs):
Returns:
str: The error message.
dict: The extra args.
"""
return f"Unknown error. Args: {str(args)}", kwargs
return f"Unknown error. Args: {str(args), str(kwargs)}"

@staticmethod
def filter_issues_by_severity(issues_list, severity):
Expand Down
1 change: 1 addition & 0 deletions hed/errors/error_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ class SidecarErrors:
SIDECAR_HED_USED_COLUMN = 'SIDECAR_HED_USED_COLUMN'
SIDECAR_NA_USED = 'SIDECAR_NA_USED'
SIDECAR_HED_USED = 'SIDECAR_HED_USED'
SIDECAR_BRACES_INVALID = "SIDECAR_BRACES_INVALID"


class SchemaErrors:
Expand Down
117 changes: 75 additions & 42 deletions hed/models/base_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,9 +251,9 @@ def columns(self):
Empty if no column names.
Returns:
columns(dict): The column number:name pairs
columns(list): the column names
"""
columns = {}
columns = []
if self._dataframe is not None and self._has_column_names:
columns = list(self._dataframe.columns)
return columns
Expand Down Expand Up @@ -354,24 +354,25 @@ def _dataframe_has_names(dataframe):
return True
return False

def assemble(self, mapper=None, skip_square_brackets=False):
def assemble(self, mapper=None, skip_curly_braces=False):
""" Assembles the hed strings
Parameters:
mapper(ColumnMapper or None): Generally pass none here unless you want special behavior.
skip_square_brackets (bool): If True, don't plug in square bracket values into columns.
skip_curly_braces (bool): If True, don't plug in curly brace values into columns.
Returns:
Dataframe: the assembled dataframe
"""
if mapper is None:
mapper = self._mapper

all_columns = self._handle_transforms(mapper)
if skip_square_brackets:
if skip_curly_braces:
return all_columns
transformers, _ = mapper.get_transformers()

return self._handle_square_brackets(all_columns, list(transformers))
refs = self.get_column_refs()
column_names = list(transformers)
return self._handle_curly_braces_refs(all_columns, refs, column_names)

def _handle_transforms(self, mapper):
transformers, need_categorical = mapper.get_transformers()
Expand All @@ -390,45 +391,67 @@ def _handle_transforms(self, mapper):
return all_columns

@staticmethod
def _find_column_refs(df, column_names):
found_column_references = []
for column_name in column_names:
df_temp = df[column_name].str.findall("\[([a-z_\-0-9]+)\]", re.IGNORECASE)
u_vals = pd.Series([j for i in df_temp if isinstance(i, list) for j in i], dtype=str)
u_vals = u_vals.unique()
for val in u_vals:
if val not in found_column_references:
found_column_references.append(val)

return found_column_references
def _replace_ref(text, newvalue, column_ref):
    """ Replace the curly brace column ref in text with newvalue.

    If newvalue is "n/a" the ref is removed instead, along with the comma and/or
    parentheses that only existed to separate or wrap it.

    Note: This function could easily be updated to handle non-curly brace values, but it's faster this way.

    Parameters:
        text (str): The input string containing the ref enclosed in curly braces.
        newvalue (str): The replacement value for the ref.
        column_ref (str): The ref to be replaced, without curly braces.

    Returns:
        str: The modified string with the ref replaced or removed.
    """
    # If it's not n/a, we can just replace directly.
    if newvalue != "n/a":
        return text.replace(f"{{{column_ref}}}", newvalue)

    def _remover(match):
        opening = match.group("p1").count("(")
        closing = match.group("p2").count(")")
        if opening > closing:
            # More starting parens than ending: keep the comma before and the unmatched "(".
            return match.group("c1") + "(" * (opening - closing)
        if closing > opening:
            # More ending parens than starting: keep the unmatched ")" and the comma after.
            return ")" * (closing - opening) + match.group("c2")
        # Balanced (or no) parens: the ref and its wrapper vanish.  When separators exist
        # on both sides keep only the trailing one, otherwise keep none.
        return match.group("c2") if match.group("c1") else ""

    # This finds all commas and parentheses surrounding a reference.
    # c1/c2 contain the comma (and possibly spaces) separating this ref from other tags.
    # p1/p2 contain the parentheses directly surrounding the ref.
    # All four groups can have spaces.
    # The column name is escaped so characters such as "." or "+" are matched literally.
    pattern = r'(?P<c1>[\s,]*)(?P<p1>[(\s]*)\{' + re.escape(column_ref) + r'\}(?P<p2>[\s)]*)(?P<c2>[\s,]*)'
    return re.sub(pattern, _remover, text)

If known columns is passed, only use those columns to find or replace references.
@staticmethod
def _handle_curly_braces_refs(df, refs, column_names):
"""
if known_columns is not None:
column_names = list(known_columns)
else:
column_names = list(df.columns)
possible_column_references = [f"{column_name}" for column_name in column_names if
isinstance(column_name, str) and column_name.lower() != "hed"]
found_column_references = BaseInput._find_column_refs(df, column_names)

valid_replacements = [col for col in found_column_references if col in possible_column_references]

# todo: break this into a sub function(probably)
for column_name in valid_replacements:
column_names.remove(column_name)
saved_columns = df[valid_replacements]
for column_name in column_names:
for replacing_name in valid_replacements:
column_name_brackets = f"[{replacing_name}]"
df[column_name] = pd.Series(x.replace(column_name_brackets, y) for x, y
Plug in curly braces with other columns
"""
# Filter out columns and refs that don't exist.
refs = [ref for ref in refs if ref in column_names]
remaining_columns = [column for column in column_names if column not in refs]

# Replace references in the columns we are saving out.
saved_columns = df[refs]
for column_name in remaining_columns:
for replacing_name in refs:
# If the data has no n/a values, this version is MUCH faster.
# column_name_brackets = f"{{{replacing_name}}}"
# df[column_name] = pd.Series(x.replace(column_name_brackets, y) for x, y
# in zip(df[column_name], saved_columns[replacing_name]))
df[column_name] = pd.Series(BaseInput._replace_ref(x, y, replacing_name) for x, y
in zip(df[column_name], saved_columns[replacing_name]))
df = df[column_names]
df = df[remaining_columns]

return df

Expand Down Expand Up @@ -462,4 +485,14 @@ def get_def_dict(self, hed_schema=None, extra_def_dicts=None):
DefinitionDict: A single definition dict representing all the data(and extra def dicts)
"""
from hed.models.definition_dict import DefinitionDict
return DefinitionDict(extra_def_dicts, hed_schema)
return DefinitionDict(extra_def_dicts, hed_schema)

def get_column_refs(self):
    """ Return the list of column refs for this file.

    This default implementation has no refs to report and returns an empty list.

    Returns:
        column_refs(list): A list of unique column refs found (always empty here).
    """
    return []
6 changes: 3 additions & 3 deletions hed/models/column_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,12 @@ def get_transformers(self):
if column.column_type == ColumnType.Ignore:
continue
elif column.column_type == ColumnType.Value:
value_str = column._hed_dict
value_str = column.hed_dict
from functools import partial
final_transformers[assign_to_column] = partial(self._value_handler, value_str)
elif column.column_type == ColumnType.Categorical:
need_categorical.append(column.column_name)
category_values = column._hed_dict
category_values = column.hed_dict
from functools import partial
final_transformers[assign_to_column] = partial(self._category_handler, category_values)
else:
Expand Down Expand Up @@ -243,7 +243,7 @@ def _add_value_columns(self, column_prefix_dictionary):
prefix = prefix + "#"
else:
prefix = prefix + "/#"
new_def = ColumnMetadata(ColumnType.Value, col, hed_dict=prefix)
new_def = ColumnMetadata(ColumnType.Value, col, source=prefix)
self._add_column_data(new_def)

def _add_column_data(self, new_column_entry):
Expand Down
96 changes: 79 additions & 17 deletions hed/models/column_metadata.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from enum import Enum
from hed.errors.error_types import SidecarErrors
import pandas as pd


class ColumnType(Enum):
Expand All @@ -21,30 +22,20 @@ class ColumnType(Enum):
class ColumnMetadata:
""" Column in a ColumnMapper. """

def __init__(self, column_type=None, name=None, hed_dict=None, column_prefix=None):
def __init__(self, column_type=None, name=None, source=None):
""" A single column entry in the column mapper.
Parameters:
column_type (ColumnType or None): How to treat this column when reading data.
name (str, int, or None): The column_name or column number identifying this column.
If name is a string, you'll need to use a column map to set the number later.
hed_dict (dict or str or None): The loaded data (usually from json) for the given def
For category columns, this is a dict.
For value columns, it's a string.
column_prefix (str or None): If present, prepend the given column_prefix to all hed tags in the columns.
Only works on ColumnType HedTags.
Notes:
- Each column from which data is retrieved must have a ColumnMetadata representing its contents.
- The column_prefix dictionaries are used when the column is processed.
source (dict or str or None): Either the entire loaded json sidecar or a single HED string
"""
if hed_dict is None:
hed_dict = {}

self.column_type = column_type
self.column_name = name
self.column_prefix = column_prefix
self._hed_dict = hed_dict
self._source = source
if column_type is None:
column_type = self._detect_column_type(self.source_dict)
self.column_type = column_type

@property
def hed_dict(self):
Expand All @@ -54,7 +45,78 @@ def hed_dict(self):
dict or str: A string or dict of strings for this column
"""
return self._hed_dict
if self._source is None or isinstance(self._source, str):
return self._source
return self._source[self.column_name].get("HED", {})

@property
def source_dict(self):
    """ The raw dict for this entry (if it exists).

    Returns:
        dict: {"HED": source} when the source is None or a single HED string.
              Otherwise the source's entry for this column name, returned as stored
              (expected to be the column's sidecar dict).
    """
    if self._source is None or isinstance(self._source, str):
        # Wrap a bare string (or nothing) so callers always get a sidecar-style entry.
        return {"HED": self._source}
    return self._source[self.column_name]

def get_hed_strings(self):
    """ Return the HED strings for this column as a pandas Series.

    Returns:
        pd.Series: A string Series built from this column's hed_dict.
                   Empty if the column has no column type.
    """
    if not self.column_type:
        return pd.Series(dtype=str)

    return pd.Series(self.hed_dict, dtype=str)

def set_hed_strings(self, new_strings):
    """ Replace the HED strings stored for this column.

    Parameters:
        new_strings (pd.Series, dict, str, or None): The new HED data for this column.
            A Series is converted to a dict for categorical columns; for any other
            column type its first entry is used.

    Returns:
        bool: True if the strings were stored.
              False if new_strings is None or the column has no column type.
    """
    if new_strings is None or not self.column_type:
        return False

    if isinstance(new_strings, pd.Series):
        if self.column_type == ColumnType.Categorical:
            new_strings = new_strings.to_dict()
        else:
            new_strings = new_strings.iloc[0]

    if self._source is None or isinstance(self._source, str):
        # The source is a bare HED string (or nothing) rather than a sidecar dict, which is
        # how hed_dict and source_dict read it.  Indexing it by column name would raise, so
        # store the new value in a form those readers return unchanged.
        if isinstance(new_strings, str):
            self._source = new_strings
        else:
            self._source = {self.column_name: {"HED": new_strings}}
    else:
        self._source[self.column_name]["HED"] = new_strings

    return True

@staticmethod
def _detect_column_type(dict_for_entry):
    """ Determine the ColumnType of a given json entry.

    Parameters:
        dict_for_entry (dict): The loaded json entry for a specific column.
            Generally has a "HED" entry among other optional ones.

    Returns:
        ColumnType or None: The determined type of the given column.
            ColumnType.Ignore if the entry is empty, not a dict, or has no "HED" key.
            None if the "HED" entry is present but is neither a dict of strings
            nor a string containing a # placeholder.
    """
    if not dict_for_entry or not isinstance(dict_for_entry, dict):
        return ColumnType.Ignore

    if "HED" not in dict_for_entry:
        return ColumnType.Ignore

    hed_entry = dict_for_entry["HED"]
    if isinstance(hed_entry, dict):
        # Categorical columns map each category to a HED string.
        if all(isinstance(entry, str) for entry in hed_entry.values()):
            return ColumnType.Categorical
        return None

    # Value columns are a single HED string with a # placeholder.
    if isinstance(hed_entry, str) and "#" in hed_entry:
        return ColumnType.Value

    return None

@staticmethod
def expected_pound_sign_count(column_type):
Expand Down
3 changes: 0 additions & 3 deletions hed/models/df_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,17 +126,14 @@ def expand_defs(df, hed_schema, def_dict, columns=None):


def _convert_to_form(hed_string, hed_schema, tag_form):
from hed import HedString
return str(HedString(hed_string, hed_schema).get_as_form(tag_form))


def _shrink_defs(hed_string, hed_schema):
from hed import HedString
return str(HedString(hed_string, hed_schema).shrink_defs())


def _expand_defs(hed_string, hed_schema, def_dict):
from hed import HedString
return str(HedString(hed_string, hed_schema, def_dict).expand_defs())


Expand Down
Loading

0 comments on commit 4fd9541

Please sign in to comment.