Merge pull request #660 from IanCa/dev3

Add sidecar brace support
hed-standard · May 9, 2023 · 4fd9541 · 4fd9541
2 parents 35f4b6e + 4fa1ff0
commit 4fd9541
Show file tree

Hide file tree

Showing 29 changed files with 709 additions and 511 deletions.
diff --git a/hed/errors/__init__.py b/hed/errors/__init__.py
@@ -1,4 +1,5 @@
 from .error_reporter import ErrorHandler, get_printable_issue_string, sort_issues
-from .error_types import DefinitionErrors, OnsetErrors, SchemaErrors, SchemaWarnings,  SidecarErrors, ValidationErrors
+from .error_types import DefinitionErrors, OnsetErrors, SchemaErrors, SchemaWarnings,  SidecarErrors, \
+    ValidationErrors, ColumnErrors
 from .error_types import ErrorContext, ErrorSeverity
 from .exceptions import HedExceptions, HedFileError
diff --git a/hed/errors/error_messages.py b/hed/errors/error_messages.py
@@ -401,23 +401,23 @@ def onset_wrong_placeholder(tag, has_placeholder):
     return f"Onset/offset def tag {tag} should not have a placeholder, but has one."
 
 
-@hed_error(ColumnErrors.INVALID_COLUMN_REF)
-def invalid_column_ref(bad_refs):
-    return f"Bad column references found(columns do not exist): {bad_refs}"
+@hed_error(ColumnErrors.INVALID_COLUMN_REF, actual_code=SidecarErrors.SIDECAR_BRACES_INVALID)
+def invalid_column_ref(bad_ref):
+    return f"The column '{bad_ref}' is unknown.'"
 
 
-@hed_error(ColumnErrors.SELF_COLUMN_REF)
+@hed_error(ColumnErrors.SELF_COLUMN_REF, actual_code=SidecarErrors.SIDECAR_BRACES_INVALID)
 def self_column_ref(self_ref):
     return f"Column references itself: {self_ref}"
 
 
-@hed_error(ColumnErrors.NESTED_COLUMN_REF)
+@hed_error(ColumnErrors.NESTED_COLUMN_REF, actual_code=SidecarErrors.SIDECAR_BRACES_INVALID)
 def nested_column_ref(column_name, ref_column):
     return f"Column {column_name} has a nested reference to {ref_column}.  " \
            f"Column reference columns cannot contain other column references."
 
 
-@hed_error(ColumnErrors.MALFORMED_COLUMN_REF)
+@hed_error(ColumnErrors.MALFORMED_COLUMN_REF, actual_code=SidecarErrors.SIDECAR_BRACES_INVALID)
 def nested_column_ref(column_name, index, symbol):
     return f"Column {column_name} has a malformed column reference.  Improper symbol {symbol} found at index {index}."
 

diff --git a/hed/errors/error_reporter.py b/hed/errors/error_reporter.py
@@ -396,10 +396,9 @@ def val_error_unknown(*args, **kwargs):
 
         Returns:
             str: The error message.
-            dict: The extra args.
 
         """
-        return f"Unknown error.  Args: {str(args)}", kwargs
+        return f"Unknown error.  Args: {str(args), str(kwargs)}"
 
     @staticmethod
     def filter_issues_by_severity(issues_list, severity):

diff --git a/hed/errors/error_types.py b/hed/errors/error_types.py
@@ -99,6 +99,7 @@ class SidecarErrors:
     SIDECAR_HED_USED_COLUMN = 'SIDECAR_HED_USED_COLUMN'
     SIDECAR_NA_USED = 'SIDECAR_NA_USED'
     SIDECAR_HED_USED = 'SIDECAR_HED_USED'
+    SIDECAR_BRACES_INVALID = "SIDECAR_BRACES_INVALID"
 
 
 class SchemaErrors:

diff --git a/hed/models/base_input.py b/hed/models/base_input.py
@@ -251,9 +251,9 @@ def columns(self):
             Empty if no column names.
 
         Returns:
-            columns(dict): The column number:name pairs
+            columns(list): the column names
         """
-        columns = {}
+        columns = []
         if self._dataframe is not None and self._has_column_names:
             columns = list(self._dataframe.columns)
         return columns
@@ -354,24 +354,25 @@ def _dataframe_has_names(dataframe):
                 return True
         return False
 
-    def assemble(self, mapper=None, skip_square_brackets=False):
+    def assemble(self, mapper=None, skip_curly_braces=False):
         """ Assembles the hed strings
 
         Parameters:
             mapper(ColumnMapper or None): Generally pass none here unless you want special behavior.
-            skip_square_brackets (bool): If True, don't plug in square bracket values into columns.
+            skip_curly_braces (bool): If True, don't plug in curly brace values into columns.
         Returns:
             Dataframe: the assembled dataframe
         """
         if mapper is None:
             mapper = self._mapper
 
         all_columns = self._handle_transforms(mapper)
-        if skip_square_brackets:
+        if skip_curly_braces:
             return all_columns
         transformers, _ = mapper.get_transformers()
-
-        return self._handle_square_brackets(all_columns, list(transformers))
+        refs = self.get_column_refs()
+        column_names = list(transformers)
+        return self._handle_curly_braces_refs(all_columns, refs, column_names)
 
     def _handle_transforms(self, mapper):
         transformers, need_categorical = mapper.get_transformers()
@@ -390,45 +391,67 @@ def _handle_transforms(self, mapper):
         return all_columns
 
     @staticmethod
-    def _find_column_refs(df, column_names):
-        found_column_references = []
-        for column_name in column_names:
-            df_temp = df[column_name].str.findall("\[([a-z_\-0-9]+)\]", re.IGNORECASE)
-            u_vals = pd.Series([j for i in df_temp if isinstance(i, list) for j in i], dtype=str)
-            u_vals = u_vals.unique()
-            for val in u_vals:
-                if val not in found_column_references:
-                    found_column_references.append(val)
-
-        return found_column_references
+    def _replace_ref(text, newvalue, column_ref):
+        """ Replace column ref in text with newvalue.  If it's n/a, delete extra commas/parentheses.
 
-    @staticmethod
-    def _handle_square_brackets(df, known_columns=None):
+        Note: This function could easily be updated to handle non-curly brace values, but it's faster this way.
+        Parameters:
+            text (str): The input string containing the ref enclosed in curly braces.
+            newvalue (str): The replacement value for the ref.
+            column_ref (str): The ref to be replaced, without curly braces
+
+        Returns:
+            str: The modified string with the ref replaced or removed.
         """
-            Plug in square brackets with other columns
+        # If it's not n/a, we can just replace directly.
+        if newvalue != "n/a":
+            return text.replace(f"{{{column_ref}}}", newvalue)
+
+        def _remover(match):
+            p1 = match.group("p1").count("(")
+            p2 = match.group("p2").count(")")
+            if p1 > p2:  # We have more starting parens than ending.  Make sure we don't remove comma before
+                output = match.group("c1") + "(" * (p1 - p2)
+            elif p2 > p1:  # We have more ending parens.  Make sure we don't remove comma after
+                output = ")" * (p2 - p1) + match.group("c2")
+            else:
+                c1 = match.group("c1")
+                c2 = match.group("c2")
+                if c1:
+                    c1 = ""
+                elif c2:
+                    c2 = ""
+                output = c1 + c2
+
+            return output
+
+        # this finds all surrounding commas and parentheses to a reference.
+        # c1/c2 contain the comma(and possibly spaces) separating this ref from other tags
+        # p1/p2 contain the parentheses directly surrounding the tag
+        # All four groups can have spaces.
+        pattern = r'(?P<c1>[\s,]*)(?P<p1>[(\s]*)\{' + column_ref + r'\}(?P<p2>[\s)]*)(?P<c2>[\s,]*)'
+        return re.sub(pattern, _remover, text)
 
-            If known columns is passed, only use those columns to find or replace references.
+    @staticmethod
+    def _handle_curly_braces_refs(df, refs, column_names):
         """
-        if known_columns is not None:
-            column_names = list(known_columns)
-        else:
-            column_names = list(df.columns)
-        possible_column_references = [f"{column_name}" for column_name in column_names if
-                                      isinstance(column_name, str) and column_name.lower() != "hed"]
-        found_column_references = BaseInput._find_column_refs(df, column_names)
-
-        valid_replacements = [col for col in found_column_references if col in possible_column_references]
-
-        # todo: break this into a sub function(probably)
-        for column_name in valid_replacements:
-            column_names.remove(column_name)
-        saved_columns = df[valid_replacements]
-        for column_name in column_names:
-            for replacing_name in valid_replacements:
-                column_name_brackets = f"[{replacing_name}]"
-                df[column_name] = pd.Series(x.replace(column_name_brackets, y) for x, y
+            Plug in curly braces with other columns
+        """
+        # Filter out columns and refs that don't exist.
+        refs = [ref for ref in refs if ref in column_names]
+        remaining_columns = [column for column in column_names if column not in refs]
+
+        # Replace references in the columns we are saving out.
+        saved_columns = df[refs]
+        for column_name in remaining_columns:
+            for replacing_name in refs:
+                # If the data has no n/a values, this version is MUCH faster.
+                # column_name_brackets = f"{{{replacing_name}}}"
+                # df[column_name] = pd.Series(x.replace(column_name_brackets, y) for x, y
+                #                             in zip(df[column_name], saved_columns[replacing_name]))
+                df[column_name] = pd.Series(BaseInput._replace_ref(x, y, replacing_name) for x, y
                                             in zip(df[column_name], saved_columns[replacing_name]))
-        df = df[column_names]
+        df = df[remaining_columns]
 
         return df
 
@@ -462,4 +485,14 @@ def get_def_dict(self, hed_schema=None, extra_def_dicts=None):
             DefinitionDict:   A single definition dict representing all the data(and extra def dicts)
         """
         from hed.models.definition_dict import DefinitionDict
-        return DefinitionDict(extra_def_dicts, hed_schema)
+        return DefinitionDict(extra_def_dicts, hed_schema)
+
+    def get_column_refs(self):
+        """ Returns a list of column refs for this file.
+
+            Default implementation returns an empty list.
+
+        Returns:
+            column_refs(list): A list of unique column refs found
+        """
+        return []
diff --git a/hed/models/column_mapper.py b/hed/models/column_mapper.py
@@ -85,12 +85,12 @@ def get_transformers(self):
             if column.column_type == ColumnType.Ignore:
                 continue
             elif column.column_type == ColumnType.Value:
-                value_str = column._hed_dict
+                value_str = column.hed_dict
                 from functools import partial
                 final_transformers[assign_to_column] = partial(self._value_handler, value_str)
             elif column.column_type == ColumnType.Categorical:
                 need_categorical.append(column.column_name)
-                category_values = column._hed_dict
+                category_values = column.hed_dict
                 from functools import partial
                 final_transformers[assign_to_column] = partial(self._category_handler, category_values)
             else:
@@ -243,7 +243,7 @@ def _add_value_columns(self, column_prefix_dictionary):
                     prefix = prefix + "#"
                 else:
                     prefix = prefix + "/#"
-                new_def = ColumnMetadata(ColumnType.Value, col, hed_dict=prefix)
+                new_def = ColumnMetadata(ColumnType.Value, col, source=prefix)
                 self._add_column_data(new_def)
 
     def _add_column_data(self, new_column_entry):

diff --git a/hed/models/column_metadata.py b/hed/models/column_metadata.py
@@ -1,5 +1,6 @@
 from enum import Enum
 from hed.errors.error_types import SidecarErrors
+import pandas as pd
 
 
 class ColumnType(Enum):
@@ -21,30 +22,20 @@ class ColumnType(Enum):
 class ColumnMetadata:
     """ Column in a ColumnMapper. """
 
-    def __init__(self, column_type=None, name=None, hed_dict=None, column_prefix=None):
+    def __init__(self, column_type=None, name=None, source=None):
         """ A single column entry in the column mapper.
 
         Parameters:
             column_type (ColumnType or None): How to treat this column when reading data.
             name (str, int, or None): The column_name or column number identifying this column.
                 If name is a string, you'll need to use a column map to set the number later.
-            hed_dict (dict or str or None): The loaded data (usually from json) for the given def
-                                     For category columns, this is a dict.
-                                     For value columns, it's a string.
-            column_prefix (str or None): If present, prepend the given column_prefix to all hed tags in the columns.
-                Only works on ColumnType HedTags.
-
-        Notes:
-            - Each column from which data is retrieved must have a ColumnMetadata representing its contents.
-            - The column_prefix dictionaries are used when the column is processed.
+            source (dict or str or None): Either the entire loaded json sidecar or a single HED string
         """
-        if hed_dict is None:
-            hed_dict = {}
-
-        self.column_type = column_type
         self.column_name = name
-        self.column_prefix = column_prefix
-        self._hed_dict = hed_dict
+        self._source = source
+        if column_type is None:
+            column_type = self._detect_column_type(self.source_dict)
+        self.column_type = column_type
 
     @property
     def hed_dict(self):
@@ -54,7 +45,78 @@ def hed_dict(self):
             dict or str: A string or dict of strings for this column
 
         """
-        return self._hed_dict
+        if self._source is None or isinstance(self._source, str):
+            return self._source
+        return self._source[self.column_name].get("HED", {})
+
+    @property
+    def source_dict(self):
+        """ The raw dict for this entry(if it exists)
+
+        Returns:
+            dict: The raw entry for this column; a None or str source is wrapped as {"HED": source}
+        """
+        if self._source is None or isinstance(self._source, str):
+            return {"HED": self._source}
+        return self._source[self.column_name]
+
+    def get_hed_strings(self):
+        if not self.column_type:
+            return pd.Series(dtype=str)
+
+        series = pd.Series(self.hed_dict, dtype=str)
+
+        return series
+
+    def set_hed_strings(self, new_strings):
+        if new_strings is None:
+            return False
+
+        if not self.column_type:
+            return False
+
+        if isinstance(new_strings, pd.Series):
+            if self.column_type == ColumnType.Categorical:
+                new_strings = new_strings.to_dict()
+            else:
+                new_strings = new_strings.iloc[0]
+
+        self._source[self.column_name]["HED"] = new_strings
+
+        return True
+
+    @staticmethod
+    def _detect_column_type(dict_for_entry):
+        """ Determine the ColumnType of a given json entry.
+
+        Parameters:
+            dict_for_entry (dict): The loaded json entry for a specific column.
+                Generally has a "HED" entry among other optional ones.
+
+        Returns:
+            ColumnType: The determined type of given column.  Returns None if unknown.
+
+        """
+        if not dict_for_entry or not isinstance(dict_for_entry, dict):
+            return ColumnType.Ignore
+
+        minimum_required_keys = ("HED",)
+        if not set(minimum_required_keys).issubset(dict_for_entry.keys()):
+            return ColumnType.Ignore
+
+        hed_entry = dict_for_entry["HED"]
+        if isinstance(hed_entry, dict):
+            if not all(isinstance(entry, str) for entry in hed_entry.values()):
+                return None
+            return ColumnType.Categorical
+
+        if not isinstance(hed_entry, str):
+            return None
+
+        if "#" not in dict_for_entry["HED"]:
+            return None
+
+        return ColumnType.Value
 
     @staticmethod
     def expected_pound_sign_count(column_type):

diff --git a/hed/models/df_util.py b/hed/models/df_util.py
@@ -126,17 +126,14 @@ def expand_defs(df, hed_schema, def_dict, columns=None):
 
 
 def _convert_to_form(hed_string, hed_schema, tag_form):
-    from hed import HedString
     return str(HedString(hed_string, hed_schema).get_as_form(tag_form))
 
 
 def _shrink_defs(hed_string, hed_schema):
-    from hed import HedString
     return str(HedString(hed_string, hed_schema).shrink_defs())
 
 
 def _expand_defs(hed_string, hed_schema, def_dict):
-    from hed import HedString
     return str(HedString(hed_string, hed_schema, def_dict).expand_defs())