Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial commit of saving/loading .tsv. #906

Merged
merged 1 commit into from
Apr 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,3 @@ myst-parser>=1.0.0
Sphinx>=5.2.2
sphinx_rtd_theme>=1.0.0
wordcloud==1.9.3
rdflib>=6
21 changes: 21 additions & 0 deletions hed/schema/hed_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from hed.schema.schema_io import schema_util
from hed.schema.schema_io.schema2xml import Schema2XML
from hed.schema.schema_io.schema2wiki import Schema2Wiki
from hed.schema.schema_io.schema2df import Schema2DF

# from hed.schema.schema_io.schema2owl import Schema2Owl
# from hed.schema.schema_io.owl_constants import ext_to_format
from hed.schema.hed_schema_section import (HedSchemaSection, HedSchemaTagSection, HedSchemaUnitClassSection,
Expand Down Expand Up @@ -298,6 +300,25 @@ def save_as_mediawiki(self, filename, save_merged=False):
opened_file.write(string)
opened_file.write('\n')

def save_as_dataframes(self, base_filename, save_merged=False):
    """ Save as a set of dataframes to .tsv files.

    Parameters:
        base_filename (str): The base save filename.  A suffix is added per
            spreadsheet, e.g. basename_Tag.tsv; any extension on base_filename
            is discarded.
        save_merged (bool):
            If True, this will save the schema as a merged schema if it is a "withStandard" schema.
            If it is not a "withStandard" schema, this setting has no effect.

    :raises OSError:
        - File cannot be saved for some reason.
    """
    output_dfs = Schema2DF.process_schema(self, save_merged)
    # Drop any extension the caller supplied; each sheet gets its own _suffix.tsv name.
    base, _ = os.path.splitext(base_filename)
    for suffix, dataframe in output_dfs.items():
        filename = f"{base}_{suffix}.tsv"
        with open(filename, mode='w', encoding='utf-8') as opened_file:
            dataframe.to_csv(opened_file, sep='\t', index=False, header=True)

# def save_as_owl(self, filename, save_merged=False, file_format=None):
# """ Save as json to a file.
#
Expand Down
1 change: 1 addition & 0 deletions hed/schema/hed_schema_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ class HedKey:

# Node attributes
InLibrary = "inLibrary"
HedID = 'hedId'

# All known properties
BoolProperty = 'boolProperty'
Expand Down
7 changes: 7 additions & 0 deletions hed/schema/hed_schema_df_constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Known tsv format suffixes

STRUCT_KEY = "Structure"
TAG_KEY = "Tag"

# todo: move more constants up here
hed_id_column = "hedId"
26 changes: 20 additions & 6 deletions hed/schema/hed_schema_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from hed.schema.schema_io.xml2schema import SchemaLoaderXML
from hed.schema.schema_io.wiki2schema import SchemaLoaderWiki
from hed.schema.schema_io.df2schema import SchemaLoaderDF
# from hed.schema.schema_io.owl2schema import SchemaLoaderOWL
from hed.schema import hed_cache

Expand All @@ -23,9 +24,11 @@ def from_string(schema_string, schema_format=".xml", schema_namespace=None, sche
""" Create a schema from the given string.

Parameters:
schema_string (str): An XML, mediawiki or OWL, file as a single long string
schema_string (str or dict): An XML, mediawiki or OWL, file as a single long string
If tsv, Must be a dict of spreadsheets as strings.
schema_format (str): The schema format of the source schema string.
Allowed normal values: .mediawiki, .xml
Allowed normal values: .mediawiki, .xml, .tsv
Note: tsv is in progress and has limited features
schema_namespace (str, None): The name_prefix all tags in this schema will accept.
schema(HedSchema or None): A hed schema to merge this new file into
It must be a with-standard schema with the same value.
Expand All @@ -46,13 +49,18 @@ def from_string(schema_string, schema_format=".xml", schema_namespace=None, sche
raise HedFileError(HedExceptions.BAD_PARAMETERS, "Empty string passed to HedSchema.from_string",
filename=name)

# Replace carriage returns with new lines since this might not be done by the caller
schema_string = schema_string.replace("\r\n", "\n")
if isinstance(schema_string, str):
# Replace carriage returns with new lines since this might not be done by the caller
schema_string = schema_string.replace("\r\n", "\n")

if schema_format.endswith(".xml"):
hed_schema = SchemaLoaderXML.load(schema_as_string=schema_string, schema=schema, name=name)
elif schema_format.endswith(".mediawiki"):
hed_schema = SchemaLoaderWiki.load(schema_as_string=schema_string, schema=schema, name=name)
elif schema_format.endswith(".tsv"):
if schema is not None:
raise HedFileError(HedExceptions.INVALID_HED_FORMAT, "Cannot pass a schema to merge into spreadsheet loading currently.", filename=name)
hed_schema = SchemaLoaderDF.load_spreadsheet(schema_as_strings=schema_string, name=name)
# elif schema_format:
# hed_schema = SchemaLoaderOWL.load(schema_as_string=schema_string, schema=schema, file_format=schema_format,
# name=name)
Expand All @@ -68,7 +76,9 @@ def load_schema(hed_path, schema_namespace=None, schema=None, name=None):
""" Load a schema from the given file or URL path.

Parameters:
hed_path (str): A filepath or url to open a schema from.
hed_path (str or dict): A filepath or url to open a schema from.
If loading a TSV file, this can be a single filename template, or a dict of filenames.
Template: basename.tsv, where files are named basename_Struct.tsv and basename_Tag.tsv
schema_namespace (str or None): The name_prefix all tags in this schema will accept.
schema(HedSchema or None): A hed schema to merge this new file into
It must be a with-standard schema with the same value.
Expand All @@ -87,7 +97,6 @@ def load_schema(hed_path, schema_namespace=None, schema=None, name=None):
raise HedFileError(HedExceptions.FILE_NOT_FOUND, "Empty file path passed to HedSchema.load_file",
filename=hed_path)

ext = os.path.splitext(hed_path.lower())[1]
is_url = hed_cache._check_if_url(hed_path)
if is_url:
try:
Expand All @@ -103,6 +112,11 @@ def load_schema(hed_path, schema_namespace=None, schema=None, name=None):
hed_schema = SchemaLoaderXML.load(hed_path, schema=schema, name=name)
elif hed_path.lower().endswith(".mediawiki"):
hed_schema = SchemaLoaderWiki.load(hed_path, schema=schema, name=name)
elif hed_path.lower().endswith(".tsv"):
if schema is not None:
raise HedFileError(HedExceptions.INVALID_HED_FORMAT,
"Cannot pass a schema to merge into spreadsheet loading currently.", filename=name)
hed_schema = SchemaLoaderDF.load_spreadsheet(filenames=hed_path, name=name)
else:
raise HedFileError(HedExceptions.INVALID_EXTENSION, "Unknown schema extension", filename=hed_path)

Expand Down
78 changes: 78 additions & 0 deletions hed/schema/schema_io/base2schema.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
import copy
import re

from hed.errors.exceptions import HedFileError, HedExceptions
from hed.errors.error_types import ErrorContext
from hed.schema import HedSchema, hed_schema_constants as constants
from hed.schema.hed_schema_constants import HedKey
from abc import abstractmethod, ABC
from hed.schema import schema_header_util
from hed.schema import hed_schema_constants

# Might need separate version again for wiki
header_attr_expression = "([^ ,]+?)=\"(.*?)\""
attr_re = re.compile(header_attr_expression)


class SchemaLoader(ABC):
""" Baseclass for schema loading, to handle basic errors and partnered schemas
Expand Down Expand Up @@ -70,6 +76,7 @@ def __init__(self, filename, schema_as_string=None, schema=None, file_format=Non
self._schema.filename = filename
self._schema.header_attributes = hed_attributes
self._loading_merged = False
self.fatal_errors = []

@property
def schema(self):
Expand Down Expand Up @@ -203,3 +210,74 @@ def find_rooted_entry(tag_entry, schema, loading_merged):
return None

return rooted_entry

def _add_fatal_error(self, line_number, line, warning_message="Schema term is empty or the line is malformed",
                     error_code=HedExceptions.WIKI_DELIMITERS_INVALID):
    """ Record a fatal loading error for the given source line in self.fatal_errors. """
    new_issues = self._format_error(line_number, line, warning_message, error_code)
    self.fatal_errors.extend(new_issues)


@staticmethod
def _format_error(row_number, row, warning_message="Schema term is empty or the line is malformed",
                  error_code=HedExceptions.GENERIC_ERROR):
    """ Build a one-element list holding a formatted error dict for the given row. """
    issue = {
        'code': error_code,
        ErrorContext.ROW: row_number,
        ErrorContext.LINE: str(row),
        "message": f"{warning_message}",
    }
    return [issue]

# Below here are generic string loading functions, used by wiki and spreadsheet formats.
@staticmethod
def _validate_attribute_string(attribute_string):
pattern = r'^[A-Za-z]+(=.+)?$'
match = re.fullmatch(pattern, attribute_string)
if match:
return match.group()

def _parse_attribute_string(self, row_number, attr_string):
    """ Parse a comma-separated attribute string into a dict.

    Parameters:
        row_number (int): The source row number, used for error reporting.
        attr_string (str): Comma separated attributes, each 'name' or 'name=value'.

    Returns:
        dict: Mapping of attribute name to value.  Bare attributes map to True;
              a repeated attribute has its values joined with commas.

    Malformed attributes are recorded via _add_fatal_error and skipped.
    """
    if not attr_string:
        return {}

    final_attributes = {}
    for attribute in (part.strip() for part in attr_string.split(',')):
        if self._validate_attribute_string(attribute) is None:
            self._add_fatal_error(row_number, attr_string,
                                  f"Malformed attribute found {attribute}. "
                                  f"Valid formatting is: attribute, or attribute=\"value\".")
            continue
        # Split only on the first '=' so a value containing '=' is not truncated;
        # the validation regex (=.+)? explicitly allows such values.
        name, *value = attribute.split("=", 1)
        if not value:
            final_attributes[name] = True
        elif name in final_attributes:
            final_attributes[name] += "," + value[0]
        else:
            final_attributes[name] = value[0]
    return final_attributes

@staticmethod
def _parse_attributes_line(version_line):
matches = {}
unmatched = []
last_end = 0

for match in attr_re.finditer(version_line):
start, end = match.span()

# If there's unmatched content between the last match and the current one.
if start > last_end:
unmatched.append(version_line[last_end:start])

matches[match.group(1)] = match.group(2)
last_end = end

# If there's unmatched content after the last match
if last_end < len(version_line):
unmatched.append(version_line[last_end:])

unmatched = [m.strip() for m in unmatched if m.strip()]
return matches, unmatched
Loading
Loading