hed-standard · VisLab · Jul 4, 2023 · Jun 28, 2023 · Jun 29, 2023 · Jun 29, 2023
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -25,7 +25,7 @@ jobs:
       - uses: actions/cache@v3
         with:
           path: ${{ env.pythonLocation }}
-          key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('dev-requirements.txt') }}
+          key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('docs/requirements.txt') }}
 
       - name: Install dependencies
         run: |
@@ -85,7 +85,7 @@ jobs:
       - uses: actions/cache@v3
         with:
           path: ${{ env.pythonLocation }}
-          key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('dev-requirements.txt') }}
+          key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('docs/requirements.txt') }}
 
       - name: Install dependencies
         run: |

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,11 @@
+Release 0.3.1 July 3, 2023
+- Pinned the versions of the pydantic and inflect libraries due to a conflict.
+- Reorganized JSON output of remodeling summaries so that all are of a consistent form.
+- Fixed summarize_hed_tags_op so that tags were correctly categorized for output.
+- Minor refactoring to reduce code complexity.
+- BaseInput and Sidecar now raise HedFileError if input could not be read.
+
+
 Release 0.3.0 June 20, 2023
 - Introduction of partnered schema.
 - Improved error handling for schema validation.

diff --git a/LICENSE b/LICENSE
@@ -1,4 +1,4 @@
-MIT License
+The MIT License (MIT)
 
 Copyright (c) 2020+ HED Standard Working Group
 

diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -1,11 +1,10 @@
 defusedxml>=0.7.1
-inflect>=6.0.2
-myst-parser>=0.18.1
+inflect>=6.0.5
 numpy>=1.21.6
 openpyxl>=3.1.0
 pandas>=1.3.5
 portalocker>=2.7.0
 semantic_version>=2.10.0
 Sphinx>=5.2.2
 sphinx_rtd_theme>=1.0.0
-wordcloud>=1.9.2
+wordcloud==1.9.2
diff --git a/hed/errors/exceptions.py b/hed/errors/exceptions.py
@@ -2,6 +2,7 @@
 
 
 class HedExceptions:
+    GENERIC_ERROR = 'GENERIC_ERROR'
     # A list of all exceptions that can be generated by the hedtools.
     FILE_NOT_FOUND = 'fileNotFound'
     BAD_PARAMETERS = 'badParameters'
@@ -10,7 +11,7 @@ class HedExceptions:
     INVALID_EXTENSION = 'invalidExtension'
 
     INVALID_DATAFRAME = 'INVALID_DATAFRAME'
-
+    INVALID_FILE_FORMAT = 'INVALID_FILE_FORMAT'
     # These are actual schema issues, not that the file cannot be found or parsed
     SCHEMA_HEADER_MISSING = 'HED_SCHEMA_HEADER_INVALID'
     HED_SCHEMA_HEADER_INVALID = 'HED_SCHEMA_HEADER_INVALID'

diff --git a/hed/models/base_input.py b/hed/models/base_input.py
@@ -43,12 +43,10 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T
             - An invalid dataframe was passed with size 0
             - An invalid extension was provided
             - A duplicate or empty column name appears
-
-        :raises OSError:
             - Cannot open the indicated file
-
-        :raises KeyError:
             - The specified worksheet name does not exist
+            - If the sidecar file or tabular file had invalid format and could not be read.
+
          """
         if mapper is None:
             mapper = ColumnMapper()
@@ -77,14 +75,20 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T
         elif not file:
             raise HedFileError(HedExceptions.FILE_NOT_FOUND, "Empty file passed to BaseInput.", file)
         elif input_type in self.TEXT_EXTENSION:
-            self._dataframe = pandas.read_csv(file, delimiter='\t', header=pandas_header,
-                                              dtype=str, keep_default_na=True, na_values=None)
+            try:
+                self._dataframe = pandas.read_csv(file, delimiter='\t', header=pandas_header,
+                                                  dtype=str, keep_default_na=True, na_values=None)
+            except Exception as e:
+                raise HedFileError(HedExceptions.INVALID_FILE_FORMAT, str(e), self.name) from e
             # Convert nan values to a known value
             self._dataframe = self._dataframe.fillna("n/a")
         elif input_type in self.EXCEL_EXTENSION:
-            self._loaded_workbook = openpyxl.load_workbook(file)
-            loaded_worksheet = self.get_worksheet(self._worksheet_name)
-            self._dataframe = self._get_dataframe_from_worksheet(loaded_worksheet, has_column_names)
+            try:
+                self._loaded_workbook = openpyxl.load_workbook(file)
+                loaded_worksheet = self.get_worksheet(self._worksheet_name)
+                self._dataframe = self._get_dataframe_from_worksheet(loaded_worksheet, has_column_names)
+            except Exception as e:
+                raise HedFileError(HedExceptions.GENERIC_ERROR, str(e), self.name) from e
         else:
             raise HedFileError(HedExceptions.INVALID_EXTENSION, "", file)
 
@@ -94,7 +98,7 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T
         # todo: Can we get rid of this behavior now that we're using pandas?
         column_issues = ColumnMapper.check_for_blank_names(self.columns, allow_blank_names=allow_blank_names)
         if column_issues:
-            raise HedFileError(HedExceptions.BAD_COLUMN_NAMES, "Duplicate or blank columns found.  See issues.",
+            raise HedFileError(HedExceptions.BAD_COLUMN_NAMES, "Duplicate or blank columns found. See issues.",
                                self.name, issues=column_issues)
 
         self.reset_mapper(mapper)
@@ -285,7 +289,7 @@ def set_cell(self, row_number, column_number, new_string_obj, tag_form="short_ta
 
         Notes:
              Any attribute of a HedTag that returns a string is a valid value of tag_form.
-             
+
         :raises ValueError:
             - There is not a loaded dataframe
 

diff --git a/hed/models/sidecar.py b/hed/models/sidecar.py
@@ -127,15 +127,13 @@ def load_sidecar_file(self, file):
         if not file:
             return {}
         elif isinstance(file, str):
+            if not self.name:
+                self.name = file
             try:
                 with open(file, "r") as fp:
-                    if not self.name:
-                        self.name = file
                     return self._load_json_file(fp)
-            except FileNotFoundError as e:
-                raise HedFileError(HedExceptions.FILE_NOT_FOUND, e.strerror, file)
-            except TypeError as e:
-                raise HedFileError(HedExceptions.FILE_NOT_FOUND, str(e), file)
+            except OSError as e:
+                raise HedFileError(HedExceptions.FILE_NOT_FOUND, e.strerror, file) from e
         else:
             return self._load_json_file(file)
 
@@ -189,12 +187,11 @@ def _load_json_file(self, fp):
 
         :raises HedFileError:
             - If the file cannot be parsed.
-
         """
         try:
             return json.load(fp)
-        except json.decoder.JSONDecodeError as e:
-            raise HedFileError(HedExceptions.CANNOT_PARSE_JSON, str(e), self.name)
+        except (json.decoder.JSONDecodeError, AttributeError) as e:
+            raise HedFileError(HedExceptions.CANNOT_PARSE_JSON, str(e), self.name) from e
 
     def extract_definitions(self, hed_schema=None, error_handler=None):
         """ Gather and validate definitions in metadata.

diff --git a/hed/models/tabular_input.py b/hed/models/tabular_input.py
@@ -13,8 +13,8 @@ def __init__(self, file=None, sidecar=None, name=None):
         """ Constructor for the TabularInput class.
 
         Parameters:
-            file (str or file like): A tsv file to open.
-            sidecar (str or Sidecar): A Sidecar filename or Sidecar
+            file (str or FileLike): A tsv file to open.
+            sidecar (str or Sidecar or FileLike): A Sidecar or source file/filename.
             name (str): The name to display for this file for error purposes.
 
         :raises HedFileError:

diff --git a/hed/tools/analysis/hed_context_manager.py b/hed/tools/analysis/hed_context_manager.py
@@ -5,7 +5,8 @@
 from hed.schema import HedSchema, HedSchemaGroup
 from hed.tools.analysis.analysis_util import hed_to_str
 
-#TODO: [Refactor] clean up distinction between hed as strings versus objects -- maybe replace by event manager.
+# TODO: [Refactor] clean up distinction between hed as strings versus objects -- maybe replace by event manager.
+# TODO: Implement insets
 
 class OnsetGroup:
     def __init__(self, name, contents, start_index, end_index=None):

diff --git a/hed/tools/analysis/hed_tag_counts.py b/hed/tools/analysis/hed_tag_counts.py
@@ -21,11 +21,11 @@ def __init__(self, hed_tag, file_name):
         self.set_value(hed_tag)
 
     def set_value(self, hed_tag):
-        """ Update the tag term value counts for a HedTag. 
-        
+        """ Update the tag term value counts for a HedTag.
+
         Parameters:
-            hed_tag (HedTag or None):  Item to use to update the value counts. 
-        
+            hed_tag (HedTag or None):  Item to use to update the value counts.
+
         """
         if not hed_tag:
             return
@@ -43,13 +43,13 @@ def get_info(self, verbose=False):
         else:
             files = len(self.files)
         return {'tag': self.tag, 'events': self.events, 'files': files}
-    
+
     def get_summary(self):
         """ Return a dictionary summary of the events and files for this tag.
-        
+
         Returns:
             dict:  dictionary summary of events and files that contain this tag.
-        
+
         """
         return {'tag': self.tag, 'events': self.events, 'files': [name for name in self.files]}
 
@@ -63,28 +63,27 @@ def get_empty(self):
 
 class HedTagCounts:
     """ Counts of HED tags for a tabular file.
-    
+
     Parameters:
         name (str):  An identifier for these counts (usually the filename of the tabular file)
         total_events (int):  The total number of events in the tabular file.
 
-
     """
 
     def __init__(self, name, total_events=0):
         self.tag_dict = {}
         self.name = name
         self.files = {}
         self.total_events = total_events
-     
+
     def update_event_counts(self, hed_string_obj, file_name, definitions=None):
-        """ Update the tag counts based on a hed string object. 
-        
+        """ Update the tag counts based on a hed string object.
+
         Parameters:
             hed_string_obj (HedString): The HED string whose tags should be counted.
             file_name (str): The name of the file corresponding to these counts.
             definitions (dict): The definitions associated with the HED string.
-            
+
         """
         if file_name not in self.files:
             self.files[file_name] = ""
@@ -100,38 +99,42 @@ def update_event_counts(self, hed_string_obj, file_name, definitions=None):
         self.merge_tag_dicts(tag_dict)
 
     def organize_tags(self, tag_template):
+        """ Organize tags into categories as specified by the tag_template.
+
+        Parameters:
+            tag_template (dict): A dictionary whose keys are titles and values are lists of HED tags (str).
+
+        Returns:
+            dict  - keys are tags (strings) and values are list of HedTagCount for items fitting template.
+            list - of HedTagCount objects corresponding to tags that don't fit the template.
+
+        """
         template = self.create_template(tag_template)
         unmatched = []
-        for key, tag_count in self.tag_dict.items():
-            matched = False
-            for tag in reversed(tag_count.tag_terms):
-                if tag in template:
-                    template[tag].append(tag_count)
-                    matched = True
-                    break
-            if not matched:
-                unmatched.append(tag_count)
+        for tag_count in self.tag_dict.values():
+            self._update_template(tag_count, template, unmatched)
         return template, unmatched
 
     def merge_tag_dicts(self, other_dict):
         for tag, count in other_dict.items():
             if tag not in self.tag_dict:
                 self.tag_dict[tag] = count.get_empty()
             self.tag_dict[tag].events = self.tag_dict[tag].events + count.events
-            value_dict = self.tag_dict[tag].value_dict
-            for value, val_count in count.value_dict.items():
-                if value in value_dict:
-                    value_dict[value] = value_dict[value] + val_count
-                else:
-                    value_dict[value] = val_count
             for file in count.files:
                 self.tag_dict[tag].files[file] = ''
+            if not self.tag_dict[tag].value_dict:
+                continue
+            for value, val_count in count.value_dict.items():
+                if value in self.tag_dict[tag].value_dict:
+                    self.tag_dict[tag].value_dict[value] = self.tag_dict[tag].value_dict + val_count
+                else:
+                    self.tag_dict[tag].value_dict[value] = val_count
 
     def get_summary(self):
         details = {}
         for tag, count in self.tag_dict.items():
             details[tag] = count.get_summary()
-        return {'name': str(self.name), 'type_tag': self.type_tag, 'files': list(self.files.keys()),
+        return {'name': str(self.name), 'files': list(self.files.keys()),
                 'total_events': self.total_events, 'details': details}
 
     @staticmethod
@@ -141,3 +144,19 @@ def create_template(tags):
             for element in key_list:
                 template_dict[element.lower()] = []
         return template_dict
+
+    @staticmethod
+    def _update_template(tag_count, template, unmatched):
+        """ Update the template or unmatched with info in the tag_count.
+
+        Parameters:
+            tag_count (HedTagCount): Information for a particular tag.
+            template (dict):  Dictionary whose keys are lowercase tags and values are lists of HedTagCount.
+
+        """
+        tag_list = reversed(list(tag_count.tag_terms))
+        for tkey in tag_list:
+            if tkey in template.keys():
+                template[tkey].append(tag_count)
+                return
+        unmatched.append(tag_count)
diff --git a/hed/tools/analysis/hed_type_factors.py b/hed/tools/analysis/hed_type_factors.py
@@ -39,18 +39,18 @@ def get_factors(self, factor_encoding="one-hot"):
             DataFrame:   DataFrame containing the factor vectors as the columns.
 
         """
-        df = pd.DataFrame(0, index=range(self.number_elements), columns=[self.type_value])
-        df.loc[list(self.direct_indices.keys()), [self.type_value]] = 1
+
         if not self.levels:
+            df = pd.DataFrame(0, index=range(self.number_elements), columns=[self.type_value])
+            df.loc[list(self.direct_indices.keys()), [self.type_value]] = 1
             return df
 
         levels = list(self.levels.keys())
         levels_list = [f"{self.type_value}.{level}" for level in levels]
-        df_levels = pd.DataFrame(0, index=range(self.number_elements), columns=levels_list)
+        factors = pd.DataFrame(0, index=range(self.number_elements), columns=levels_list)
         for index, level in enumerate(levels):
             index_keys = list(self.levels[level].keys())
-            df_levels.loc[index_keys, [levels_list[index]]] = 1
-        factors = pd.concat([df, df_levels], axis=1)
+            factors.loc[index_keys, [levels_list[index]]] = 1
         if factor_encoding == "one-hot":
             return factors
         sum_factors = factors.sum(axis=1)

diff --git a/hed/tools/analysis/hed_type_manager.py b/hed/tools/analysis/hed_type_manager.py
@@ -44,19 +44,21 @@ def get_factor_vectors(self, type_tag, type_values=None, factor_encoding="one-ho
             factor_encoding (str):   Specifies type of factor encoding (one-hot or categorical).
 
         Returns:
-            DataFrame:   DataFrame containing the factor vectors as the columns.
+            DataFrame or None:   DataFrame containing the factor vectors as the columns.
 
         """
-        this_var = self.get_type_variable(type_tag)
+        this_var = self.get_type_variable(type_tag.lower())
         if this_var is None:
             return None
         variables = this_var.get_type_value_names()
-        if variables is None:
-            variables = type_values
-        df_list = [0]*len(variables)
-        for index, variable in enumerate(variables):
+        if not type_values:
+            type_values = variables
+        df_list = [0]*len(type_values)
+        for index, variable in enumerate(type_values):
             var_sum = this_var._type_value_map[variable]
             df_list[index] = var_sum.get_factors(factor_encoding=factor_encoding)
+        if not df_list:
+            return None
         return pd.concat(df_list, axis=1)
 
     def get_type_variable(self, type_tag):