Skip to content

Commit

Permalink
Merge pull request #906 from IanCa/develop
Browse files Browse the repository at this point in the history
Initial commit of saving/loading .tsv.
  • Loading branch information
VisLab authored Apr 18, 2024
2 parents f9fcc80 + 151266e commit cec8527
Show file tree
Hide file tree
Showing 15 changed files with 771 additions and 129 deletions.
1 change: 0 additions & 1 deletion docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,3 @@ myst-parser>=1.0.0
Sphinx>=5.2.2
sphinx_rtd_theme>=1.0.0
wordcloud==1.9.3
rdflib>=6
21 changes: 21 additions & 0 deletions hed/schema/hed_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from hed.schema.schema_io import schema_util
from hed.schema.schema_io.schema2xml import Schema2XML
from hed.schema.schema_io.schema2wiki import Schema2Wiki
from hed.schema.schema_io.schema2df import Schema2DF

# from hed.schema.schema_io.schema2owl import Schema2Owl
# from hed.schema.schema_io.owl_constants import ext_to_format
from hed.schema.hed_schema_section import (HedSchemaSection, HedSchemaTagSection, HedSchemaUnitClassSection,
Expand Down Expand Up @@ -298,6 +300,25 @@ def save_as_mediawiki(self, filename, save_merged=False):
opened_file.write(string)
opened_file.write('\n')

def save_as_dataframes(self, base_filename, save_merged=False):
    """ Save the schema as a set of tab-separated (.tsv) files.

    One file is written per dataframe returned by Schema2DF.process_schema.

    Parameters:
        base_filename (str): The save filename template.
            Any extension is dropped and each file is written as <base>_<suffix>.tsv, e.g. <base>_Tag.tsv
        save_merged (bool): If True, this will save the schema as a merged schema if it is a "withStandard" schema.
            If it is not a "withStandard" schema, this setting has no effect.

    :raises OSError:
        - File cannot be saved for some reason.
    """
    output_dfs = Schema2DF.process_schema(self, save_merged)
    # Only the stem is needed; the extension is always replaced by "_<suffix>.tsv".
    base = os.path.splitext(base_filename)[0]
    for suffix, dataframe in output_dfs.items():
        filename = f"{base}_{suffix}.tsv"
        # newline='' disables newline translation on the handle, as pandas requires for file objects
        # passed to to_csv; without it each row ends in "\r\r\n" on Windows.
        with open(filename, mode='w', encoding='utf-8', newline='') as opened_file:
            dataframe.to_csv(opened_file, sep='\t', index=False, header=True)

# def save_as_owl(self, filename, save_merged=False, file_format=None):
# """ Save as json to a file.
#
Expand Down
1 change: 1 addition & 0 deletions hed/schema/hed_schema_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ class HedKey:

# Node attributes
InLibrary = "inLibrary"
HedID = 'hedId'

# All known properties
BoolProperty = 'boolProperty'
Expand Down
7 changes: 7 additions & 0 deletions hed/schema/hed_schema_df_constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Constants shared by the tsv (dataframe) schema format.

# Known tsv format suffixes
# NOTE(review): presumably these are the "<suffix>" part of "<base>_<suffix>.tsv" filenames - confirm against
# the Schema2DF / SchemaLoaderDF implementations.
STRUCT_KEY = "Structure"
TAG_KEY = "Tag"

# todo: move more constants up here
# Name of the HED ID column; same spelling as HedKey.HedID ('hedId') in hed_schema_constants.
hed_id_column = "hedId"
26 changes: 20 additions & 6 deletions hed/schema/hed_schema_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from hed.schema.schema_io.xml2schema import SchemaLoaderXML
from hed.schema.schema_io.wiki2schema import SchemaLoaderWiki
from hed.schema.schema_io.df2schema import SchemaLoaderDF
# from hed.schema.schema_io.owl2schema import SchemaLoaderOWL
from hed.schema import hed_cache

Expand All @@ -23,9 +24,11 @@ def from_string(schema_string, schema_format=".xml", schema_namespace=None, sche
""" Create a schema from the given string.
Parameters:
schema_string (str): An XML, mediawiki or OWL, file as a single long string
schema_string (str or dict): An XML, mediawiki or OWL, file as a single long string
If tsv, Must be a dict of spreadsheets as strings.
schema_format (str): The schema format of the source schema string.
Allowed normal values: .mediawiki, .xml
Allowed normal values: .mediawiki, .xml, .tsv
Note: tsv is in progress and has limited features
schema_namespace (str, None): The name_prefix all tags in this schema will accept.
schema(HedSchema or None): A hed schema to merge this new file into
It must be a with-standard schema with the same value.
Expand All @@ -46,13 +49,18 @@ def from_string(schema_string, schema_format=".xml", schema_namespace=None, sche
raise HedFileError(HedExceptions.BAD_PARAMETERS, "Empty string passed to HedSchema.from_string",
filename=name)

# Replace carriage returns with new lines since this might not be done by the caller
schema_string = schema_string.replace("\r\n", "\n")
if isinstance(schema_string, str):
# Replace carriage returns with new lines since this might not be done by the caller
schema_string = schema_string.replace("\r\n", "\n")

if schema_format.endswith(".xml"):
hed_schema = SchemaLoaderXML.load(schema_as_string=schema_string, schema=schema, name=name)
elif schema_format.endswith(".mediawiki"):
hed_schema = SchemaLoaderWiki.load(schema_as_string=schema_string, schema=schema, name=name)
elif schema_format.endswith(".tsv"):
if schema is not None:
raise HedFileError(HedExceptions.INVALID_HED_FORMAT, "Cannot pass a schema to merge into spreadsheet loading currently.", filename=name)
hed_schema = SchemaLoaderDF.load_spreadsheet(schema_as_strings=schema_string, name=name)
# elif schema_format:
# hed_schema = SchemaLoaderOWL.load(schema_as_string=schema_string, schema=schema, file_format=schema_format,
# name=name)
Expand All @@ -68,7 +76,9 @@ def load_schema(hed_path, schema_namespace=None, schema=None, name=None):
""" Load a schema from the given file or URL path.
Parameters:
hed_path (str): A filepath or url to open a schema from.
hed_path (str or dict): A filepath or url to open a schema from.
If loading a TSV file, this can be a single filename template, or a dict of filenames.
Template: basename.tsv, where files are named basename_Struct.tsv and basename_Tag.tsv
schema_namespace (str or None): The name_prefix all tags in this schema will accept.
schema(HedSchema or None): A hed schema to merge this new file into
It must be a with-standard schema with the same value.
Expand All @@ -87,7 +97,6 @@ def load_schema(hed_path, schema_namespace=None, schema=None, name=None):
raise HedFileError(HedExceptions.FILE_NOT_FOUND, "Empty file path passed to HedSchema.load_file",
filename=hed_path)

ext = os.path.splitext(hed_path.lower())[1]
is_url = hed_cache._check_if_url(hed_path)
if is_url:
try:
Expand All @@ -103,6 +112,11 @@ def load_schema(hed_path, schema_namespace=None, schema=None, name=None):
hed_schema = SchemaLoaderXML.load(hed_path, schema=schema, name=name)
elif hed_path.lower().endswith(".mediawiki"):
hed_schema = SchemaLoaderWiki.load(hed_path, schema=schema, name=name)
elif hed_path.lower().endswith(".tsv"):
if schema is not None:
raise HedFileError(HedExceptions.INVALID_HED_FORMAT,
"Cannot pass a schema to merge into spreadsheet loading currently.", filename=name)
hed_schema = SchemaLoaderDF.load_spreadsheet(filenames=hed_path, name=name)
else:
raise HedFileError(HedExceptions.INVALID_EXTENSION, "Unknown schema extension", filename=hed_path)

Expand Down
78 changes: 78 additions & 0 deletions hed/schema/schema_io/base2schema.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
import copy
import re

from hed.errors.exceptions import HedFileError, HedExceptions
from hed.errors.error_types import ErrorContext
from hed.schema import HedSchema, hed_schema_constants as constants
from hed.schema.hed_schema_constants import HedKey
from abc import abstractmethod, ABC
from hed.schema import schema_header_util
from hed.schema import hed_schema_constants

# Might need separate version again for wiki
# Matches a single key="value" pair:
#   group 1 - the key: shortest run of characters that are neither spaces nor commas, ending at '='
#   group 2 - the value: everything (possibly empty) between the quotes, up to the next '"'
header_attr_expression = "([^ ,]+?)=\"(.*?)\""
# Compiled once at import time; used by SchemaLoader._parse_attributes_line.
attr_re = re.compile(header_attr_expression)


class SchemaLoader(ABC):
""" Baseclass for schema loading, to handle basic errors and partnered schemas
Expand Down Expand Up @@ -70,6 +76,7 @@ def __init__(self, filename, schema_as_string=None, schema=None, file_format=Non
self._schema.filename = filename
self._schema.header_attributes = hed_attributes
self._loading_merged = False
self.fatal_errors = []

@property
def schema(self):
Expand Down Expand Up @@ -203,3 +210,74 @@ def find_rooted_entry(tag_entry, schema, loading_merged):
return None

return rooted_entry

def _add_fatal_error(self, line_number, line, warning_message="Schema term is empty or the line is malformed",
                     error_code=HedExceptions.WIKI_DELIMITERS_INVALID):
    """ Append a formatted error for the given line to self.fatal_errors.

    Parameters:
        line_number: The row/line identifier stored with the error.
        line: The offending line; it is converted to a string by _format_error.
        warning_message (str): Human readable description of the problem.
        error_code: The error code stored with the error.
    """
    new_errors = self._format_error(line_number, line, warning_message, error_code)
    self.fatal_errors += new_errors


@staticmethod
def _format_error(row_number, row, warning_message="Schema term is empty or the line is malformed",
                  error_code=HedExceptions.GENERIC_ERROR):
    """ Build a single error dictionary and return it wrapped in a list.

    Parameters:
        row_number: Stored under ErrorContext.ROW.
        row: The offending row; stored as a string under ErrorContext.LINE.
        warning_message (str): Stored under "message".
        error_code: Stored under 'code'.

    Returns:
        list: A one element list containing the error dictionary.
    """
    issue = {}
    issue['code'] = error_code
    issue[ErrorContext.ROW] = row_number
    issue[ErrorContext.LINE] = str(row)
    issue["message"] = f"{warning_message}"
    return [issue]

# Below here are generic string loading functions, used by wiki and spreadsheet formats.
@staticmethod
def _validate_attribute_string(attribute_string):
pattern = r'^[A-Za-z]+(=.+)?$'
match = re.fullmatch(pattern, attribute_string)
if match:
return match.group()

def _parse_attribute_string(self, row_number, attr_string):
    """ Parse a comma separated attribute string into a dictionary.

    Parameters:
        row_number: The row identifier passed along to _add_fatal_error for malformed attributes.
        attr_string (str): The attributes, e.g. 'attribute1, attribute2=value'.  May be empty or None.

    Returns:
        dict: Maps attribute name to True for a bare attribute, or to its value string.
            When the same name appears with a value more than once, the values are joined with commas.

    Notes:
        Attributes that fail _validate_attribute_string are reported through _add_fatal_error and skipped.
    """
    if not attr_string:
        return {}

    final_attributes = {}
    for attribute in (x.strip() for x in attr_string.split(',')):
        if self._validate_attribute_string(attribute) is None:
            self._add_fatal_error(row_number, attr_string,
                                  f"Malformed attribute found {attribute}. "
                                  f"Valid formatting is: attribute, or attribute=\"value\".")
            continue
        # Split on the first "=" only, so a value that itself contains "=" is kept whole
        # (a plain split("=") followed by [1] silently dropped everything after a second "=").
        name, separator, value = attribute.partition("=")
        if not separator:
            final_attributes[name] = True
        elif name in final_attributes:
            # NOTE(review): if name was first seen as a bare attribute its entry is True,
            # and this += raises TypeError - decide how that combination should be handled.
            final_attributes[name] += "," + value
        else:
            final_attributes[name] = value
    return final_attributes

@staticmethod
def _parse_attributes_line(version_line):
    """ Split a line into key="value" pairs and whatever text is left over.

    Parameters:
        version_line (str): The line to scan with the module level attr_re pattern.

    Returns:
        tuple:
            - dict: Key to value for every pair found (a repeated key keeps its last value).
            - list: The stripped, non-empty pieces of text that were not part of any pair, in order.
    """
    pairs = {}
    gaps = []
    cursor = 0

    for hit in attr_re.finditer(version_line):
        begin, finish = hit.span()
        # Text between the previous pair and this one (empty when they are adjacent).
        gaps.append(version_line[cursor:begin])
        key, value = hit.group(1), hit.group(2)
        pairs[key] = value
        cursor = finish

    # Text after the final pair (empty when the last pair ends the line).
    gaps.append(version_line[cursor:])

    stripped_gaps = (gap.strip() for gap in gaps)
    leftovers = [gap for gap in stripped_gaps if gap]
    return pairs, leftovers
Loading

0 comments on commit cec8527

Please sign in to comment.