Merge pull request #906 from IanCa/develop

Initial commit of saving/loading .tsv.
hed-standard · Apr 18, 2024 · cec8527 · cec8527
2 parents f9fcc80 + 151266e
commit cec8527
Show file tree

Hide file tree

Showing 15 changed files with 771 additions and 129 deletions.
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -10,4 +10,3 @@ myst-parser>=1.0.0
 Sphinx>=5.2.2
 sphinx_rtd_theme>=1.0.0
 wordcloud==1.9.3
-rdflib>=6
diff --git a/hed/schema/hed_schema.py b/hed/schema/hed_schema.py
@@ -6,6 +6,8 @@
 from hed.schema.schema_io import schema_util
 from hed.schema.schema_io.schema2xml import Schema2XML
 from hed.schema.schema_io.schema2wiki import Schema2Wiki
+from hed.schema.schema_io.schema2df import Schema2DF
+
 # from hed.schema.schema_io.schema2owl import Schema2Owl
 # from hed.schema.schema_io.owl_constants import ext_to_format
 from hed.schema.hed_schema_section import (HedSchemaSection, HedSchemaTagSection, HedSchemaUnitClassSection,
@@ -298,6 +300,25 @@ def save_as_mediawiki(self, filename, save_merged=False):
                 opened_file.write(string)
                 opened_file.write('\n')
 
+    def save_as_dataframes(self, base_filename, save_merged=False):
+        """ Save the schema as a set of tab-separated (.tsv) files, one per dataframe.
+
+        Parameters:
+            base_filename (str): Base name for the output files.  Any extension is dropped and each
+                                 file is written as <base>_<suffix>.tsv, e.g. <base>_Tag.tsv.
+            save_merged (bool): If True, this will save the schema as a merged schema if it is a
+                                "withStandard" schema.
+                                If it is not a "withStandard" schema, this setting has no effect.
+
+        :raises OSError:
+            - File cannot be saved for some reason.
+        """
+        output_dfs = Schema2DF.process_schema(self, save_merged)
+        base = os.path.splitext(base_filename)[0]
+        for suffix, dataframe in output_dfs.items():
+            filename = f"{base}_{suffix}.tsv"
+            # newline='' leaves line endings to pandas; with the default text-mode newline
+            # translation the carriage returns are doubled on Windows.
+            with open(filename, mode='w', encoding='utf-8', newline='') as opened_file:
+                dataframe.to_csv(opened_file, sep='\t', index=False, header=True)
+
     # def save_as_owl(self, filename, save_merged=False, file_format=None):
     #     """ Save as json to a file.
     #

diff --git a/hed/schema/hed_schema_constants.py b/hed/schema/hed_schema_constants.py
@@ -59,6 +59,7 @@ class HedKey:
 
     # Node attributes
     InLibrary = "inLibrary"
+    HedID = 'hedId'
 
     # All known properties
     BoolProperty = 'boolProperty'

diff --git a/hed/schema/hed_schema_df_constants.py b/hed/schema/hed_schema_df_constants.py
@@ -0,0 +1,7 @@
+# Known tsv format suffixes
+# NOTE(review): presumably these are the <suffix> part of <base>_<suffix>.tsv file names -
+# confirm against the tsv saver/loader.
+
+STRUCT_KEY = "Structure"
+TAG_KEY = "Tag"
+
+# todo: move more constants up here
+# Presumably the name of the spreadsheet column holding the hedId value - TODO confirm.
+hed_id_column = "hedId"
diff --git a/hed/schema/hed_schema_io.py b/hed/schema/hed_schema_io.py
@@ -5,6 +5,7 @@
 
 from hed.schema.schema_io.xml2schema import SchemaLoaderXML
 from hed.schema.schema_io.wiki2schema import SchemaLoaderWiki
+from hed.schema.schema_io.df2schema import SchemaLoaderDF
 # from hed.schema.schema_io.owl2schema import SchemaLoaderOWL
 from hed.schema import hed_cache
 
@@ -23,9 +24,11 @@ def from_string(schema_string, schema_format=".xml", schema_namespace=None, sche
     """ Create a schema from the given string.
 
     Parameters:
-        schema_string (str):         An XML, mediawiki or OWL, file as a single long string
+        schema_string (str or dict): An XML, mediawiki, or OWL file as a single long string.
+            If tsv, must be a dict of spreadsheets as strings.
         schema_format (str):         The schema format of the source schema string.
-            Allowed normal values: .mediawiki, .xml
+            Allowed normal values: .mediawiki, .xml, .tsv
+            Note: tsv is in progress and has limited features
         schema_namespace (str, None):  The name_prefix all tags in this schema will accept.
         schema(HedSchema or None): A hed schema to merge this new file into
                                    It must be a with-standard schema with the same value.
@@ -46,13 +49,18 @@ def from_string(schema_string, schema_format=".xml", schema_namespace=None, sche
         raise HedFileError(HedExceptions.BAD_PARAMETERS, "Empty string passed to HedSchema.from_string",
                            filename=name)
 
-    # Replace carriage returns with new lines since this might not be done by the caller
-    schema_string = schema_string.replace("\r\n", "\n")
+    if isinstance(schema_string, str):
+        # Replace carriage returns with new lines since this might not be done by the caller
+        schema_string = schema_string.replace("\r\n", "\n")
 
     if schema_format.endswith(".xml"):
         hed_schema = SchemaLoaderXML.load(schema_as_string=schema_string, schema=schema, name=name)
     elif schema_format.endswith(".mediawiki"):
         hed_schema = SchemaLoaderWiki.load(schema_as_string=schema_string, schema=schema, name=name)
+    elif schema_format.endswith(".tsv"):
+        if schema is not None:
+            raise HedFileError(HedExceptions.INVALID_HED_FORMAT, "Cannot pass a schema to merge into spreadsheet loading currently.", filename=name)
+        hed_schema = SchemaLoaderDF.load_spreadsheet(schema_as_strings=schema_string, name=name)
     # elif schema_format:
     #     hed_schema = SchemaLoaderOWL.load(schema_as_string=schema_string, schema=schema, file_format=schema_format,
     #                                       name=name)
@@ -68,7 +76,9 @@ def load_schema(hed_path, schema_namespace=None, schema=None, name=None):
     """ Load a schema from the given file or URL path.
 
     Parameters:
-        hed_path (str): A filepath or url to open a schema from.
+        hed_path (str or dict): A filepath or url to open a schema from.
+            If loading a TSV file, this can be a single filename template, or a dict of filenames.
+            Template: basename.tsv, where files are named basename_Structure.tsv and basename_Tag.tsv
         schema_namespace (str or None): The name_prefix all tags in this schema will accept.
         schema(HedSchema or None): A hed schema to merge this new file into
                                    It must be a with-standard schema with the same value.
@@ -87,7 +97,6 @@ def load_schema(hed_path, schema_namespace=None, schema=None, name=None):
         raise HedFileError(HedExceptions.FILE_NOT_FOUND, "Empty file path passed to HedSchema.load_file",
                            filename=hed_path)
 
-    ext = os.path.splitext(hed_path.lower())[1]
     is_url = hed_cache._check_if_url(hed_path)
     if is_url:
         try:
@@ -103,6 +112,11 @@ def load_schema(hed_path, schema_namespace=None, schema=None, name=None):
         hed_schema = SchemaLoaderXML.load(hed_path, schema=schema, name=name)
     elif hed_path.lower().endswith(".mediawiki"):
         hed_schema = SchemaLoaderWiki.load(hed_path, schema=schema, name=name)
+    elif hed_path.lower().endswith(".tsv"):
+        if schema is not None:
+            raise HedFileError(HedExceptions.INVALID_HED_FORMAT,
+                               "Cannot pass a schema to merge into spreadsheet loading currently.", filename=name)
+        hed_schema = SchemaLoaderDF.load_spreadsheet(filenames=hed_path, name=name)
     else:
         raise HedFileError(HedExceptions.INVALID_EXTENSION, "Unknown schema extension", filename=hed_path)
 

diff --git a/hed/schema/schema_io/base2schema.py b/hed/schema/schema_io/base2schema.py
@@ -1,12 +1,18 @@
 import copy
+import re
 
 from hed.errors.exceptions import HedFileError, HedExceptions
+from hed.errors.error_types import ErrorContext
 from hed.schema import HedSchema, hed_schema_constants as constants
 from hed.schema.hed_schema_constants import HedKey
 from abc import abstractmethod, ABC
 from hed.schema import schema_header_util
 from hed.schema import hed_schema_constants
 
+# Might need separate version again for wiki
+# Matches one name="value" pair: group 1 is the name (no spaces or commas), group 2 is the
+# text between the double quotes (non-greedy, so it stops at the next double quote).
+header_attr_expression = "([^ ,]+?)=\"(.*?)\""
+attr_re = re.compile(header_attr_expression)
+
 
 class SchemaLoader(ABC):
     """ Baseclass for schema loading, to handle basic errors and partnered schemas
@@ -70,6 +76,7 @@ def __init__(self, filename, schema_as_string=None, schema=None, file_format=Non
         self._schema.filename = filename
         self._schema.header_attributes = hed_attributes
         self._loading_merged = False
+        self.fatal_errors = []
 
     @property
     def schema(self):
@@ -203,3 +210,74 @@ def find_rooted_entry(tag_entry, schema, loading_merged):
                 return None
 
             return rooted_entry
+
+    def _add_fatal_error(self, line_number, line, warning_message="Schema term is empty or the line is malformed",
+                         error_code=HedExceptions.WIKI_DELIMITERS_INVALID):
+        """ Record one fatal error on this loader.
+
+        Parameters:
+            line_number: Row/line number stored with the error.
+            line: The offending line; it is stored as str(line).
+            warning_message (str): Human readable description of the problem.
+            error_code: Error code stored with the error.
+        """
+        new_errors = self._format_error(line_number, line, warning_message, error_code)
+        self.fatal_errors.extend(new_errors)
+
+
+    @staticmethod
+    def _format_error(row_number, row, warning_message="Schema term is empty or the line is malformed",
+                      error_code=HedExceptions.GENERIC_ERROR):
+        """ Build the error entry for a single row.
+
+        Parameters:
+            row_number: Row number stored under ErrorContext.ROW.
+            row: The offending row; stored as str(row) under ErrorContext.LINE.
+            warning_message (str): Text stored under "message".
+            error_code: Value stored under 'code'.
+
+        Returns:
+            list: A one-element list containing the error dict.
+        """
+        return [{
+            'code': error_code,
+            ErrorContext.ROW: row_number,
+            ErrorContext.LINE: str(row),
+            "message": f"{warning_message}",
+        }]
+
+    # Below here are generic string loading functions, used by wiki and spreadsheet formats.
+    @staticmethod
+    def _validate_attribute_string(attribute_string):
+        """ Check that a single attribute looks like `name` or `name=value`.
+
+        Parameters:
+            attribute_string (str): One attribute, already split from its neighbors.
+
+        Returns:
+            str or None: The matched text if the attribute is well formed, otherwise None.
+        """
+        found = re.fullmatch(r'^[A-Za-z]+(=.+)?$', attribute_string)
+        return found.group() if found else None
+
+    def _parse_attribute_string(self, row_number, attr_string):
+        """ Parse a comma separated attribute string into a dict.
+
+        Parameters:
+            row_number: Row number reported with any malformed attribute.
+            attr_string (str): Attributes in the form `name` or `name=value`, separated by commas.
+
+        Returns:
+            dict: Attribute name -> True for bare attributes, or the value string.
+                  Repeated `name=value` entries are joined with commas.
+
+        Notes:
+            - Malformed attributes are recorded through _add_fatal_error and skipped.
+        """
+        if not attr_string:
+            return {}
+
+        final_attributes = {}
+        for attribute in (x.strip() for x in attr_string.split(',')):
+            if self._validate_attribute_string(attribute) is None:
+                self._add_fatal_error(row_number, attr_string,
+                                      f"Malformed attribute found {attribute}.  "
+                                      f"Valid formatting is: attribute, or attribute=\"value\".")
+                continue
+            # Split on the first '=' only, so a value that itself contains '=' is kept whole.
+            name, has_value, value = attribute.partition("=")
+            if not has_value:
+                final_attributes[name] = True
+            elif name in final_attributes:
+                final_attributes[name] += "," + value
+            else:
+                final_attributes[name] = value
+        return final_attributes
+
+    @staticmethod
+    def _parse_attributes_line(version_line):
+        """ Split a line into name="value" pairs and whatever text is left over.
+
+        Parameters:
+            version_line (str): The line to scan with attr_re.
+
+        Returns:
+            tuple: (dict of name -> value for every pair found,
+                    list of the non-blank, stripped text fragments that were not part of any pair).
+        """
+        found = {}
+        leftovers = []
+        cursor = 0
+
+        for hit in attr_re.finditer(version_line):
+            begin, finish = hit.span()
+
+            # Text sitting between the previous pair and this one did not match.
+            if begin > cursor:
+                leftovers.append(version_line[cursor:begin])
+
+            name, value = hit.groups()
+            found[name] = value
+            cursor = finish
+
+        # Trailing text after the final pair did not match either.
+        if cursor < len(version_line):
+            leftovers.append(version_line[cursor:])
+
+        stripped = (piece.strip() for piece in leftovers)
+        return found, [piece for piece in stripped if piece]