Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preliminary implement of definition summary #641

Merged
merged 4 commits into from
Mar 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 116 additions & 0 deletions hed/tools/remodeling/operations/summarize_definitions_op.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
""" Summarize the HED definitions found in tabular files. """

from hed import DefinitionDict, TabularInput, Sidecar
from hed.models.df_util import process_def_expands
from hed.tools.analysis.analysis_util import assemble_hed
from hed.tools.remodeling.operations.base_op import BaseOp
from hed.tools.remodeling.operations.base_context import BaseContext


class SummarizeDefinitionsOp(BaseOp):
    """ Summarize the definitions found while processing tabular files.

    Required remodeling parameters:
        - **summary_name** (*str*): The name of the summary.
        - **summary_filename** (*str*): Base filename of the summary.

    The operation does not modify the data. Each call to do_op feeds the
    data (together with its sidecar and the dispatcher's schema) into a
    DefinitionSummaryContext that is shared through the dispatcher.

    """

    PARAMS = {
        "operation": "summarize_definitions",
        "required_parameters": {
            "summary_name": str,
            "summary_filename": str
        },
        "optional_parameters": {
        }
    }

    SUMMARY_TYPE = 'definitions'

    def __init__(self, parameters):
        """ Constructor for the summarize definitions operation.

        Parameters:
            parameters (dict): Dictionary with the parameter values for required and optional parameters.

        Raises:

            KeyError
                - If a required parameter is missing.
                - If an unexpected parameter is provided.

            TypeError
                - If a parameter has the wrong type.

        Notes:
            Parameter checking is delegated to the BaseOp constructor, which
            receives PARAMS together with the supplied parameters.

        """

        super().__init__(self.PARAMS, parameters)
        self.summary_name = parameters['summary_name']
        self.summary_filename = parameters['summary_filename']

    def do_op(self, dispatcher, df, name, sidecar=None):
        """ Update the definition summary with the contents of one tabular file.

        Parameters:
            dispatcher (Dispatcher): Manages the operation I/O.
            df (DataFrame): The DataFrame to be summarized.
            name (str): Unique identifier for the dataframe -- often the original file path.
            sidecar (Sidecar or file-like): Only needed for HED operations.

        Returns:
            DataFrame: The same DataFrame that was passed in (it is not modified by this method).

        Side-effect:
            Updates the context, creating it in dispatcher.context_dict on first use.

        """

        summary = dispatcher.context_dict.get(self.summary_name)
        # Test for absence explicitly: an existing context must be reused even
        # if it happens to evaluate as falsy, otherwise accumulated state is lost.
        if summary is None:
            summary = DefinitionSummaryContext(self)
            dispatcher.context_dict[self.summary_name] = summary
        summary.update_context({'df': dispatcher.post_proc_data(df), 'name': name, 'sidecar': sidecar,
                                'schema': dispatcher.hed_schema})
        return df


class DefinitionSummaryContext(BaseContext):
    """ Accumulate definition information across calls to update_context.

    Attributes set by the constructor:
        defs: Starts as an empty DefinitionDict; replaced on every update by
            the first value returned from process_def_expands.
        unresolved (dict): Replaced on every update by the second value
            returned from process_def_expands.
        errors (dict): Merged (dict.update) with the third value returned
            from process_def_expands on every update.

    The reporting methods are placeholders in this preliminary version.

    """

    def __init__(self, sum_op):
        """ Create an empty context for a summarize-definitions operation.

        Parameters:
            sum_op (SummarizeDefinitionsOp): Operation supplying SUMMARY_TYPE, summary_name and summary_filename.

        """
        super().__init__(sum_op.SUMMARY_TYPE, sum_op.summary_name, sum_op.summary_filename)
        self.defs = DefinitionDict()
        self.unresolved = {}
        self.errors = {}

    def update_context(self, new_context):
        """ Fold the definitions found in one tabular file into this context.

        Parameters:
            new_context (dict): Must contain the keys 'df', 'name', 'sidecar' and 'schema'.

        Raises:
            KeyError: If one of the required keys is missing from new_context.

        """
        # Look each entry up once; the values are reused below.
        name = new_context['name']
        sidecar_source = new_context['sidecar']
        schema = new_context['schema']

        data_input = TabularInput(new_context['df'], sidecar=sidecar_source, name=name)
        # NOTE(review): the sidecar value is wrapped unconditionally, although
        # the operation allows sidecar=None -- confirm Sidecar accepts None.
        sidecar = Sidecar(sidecar_source)
        df, _ = assemble_hed(data_input, sidecar, schema,
                             columns_included=None, expand_defs=True)
        hed_strings = df['HED_assembled']

        # The previous results are passed back in so each call builds on the
        # state accumulated by earlier calls.
        self.defs, self.unresolved, errors = process_def_expands(hed_strings, schema,
                                                                 known_defs=self.defs,
                                                                 ambiguous_defs=self.unresolved)
        self.errors.update(errors)

    def _get_summary_details(self, summary):
        """ Placeholder: currently returns None regardless of summary. """
        return None

    def _merge_all(self):
        """ Placeholder: currently returns None. """
        return None

    def _get_result_string(self, name, result, indent=BaseContext.DISPLAY_INDENT):
        """ Return the display string for a result.

        Parameters:
            name (str): "Dataset" selects the dataset formatter; any other value selects the individual formatter.
            result: The result to format (passed through to the formatter).
            indent (str): Passed through to the formatter.

        Returns:
            str: The formatted string (always empty in this preliminary version).

        """
        if name == "Dataset":
            return self._get_dataset_string(result, indent=indent)
        return self._get_individual_string(name, result, indent=indent)

    @staticmethod
    def _get_dataset_string(result, indent=BaseContext.DISPLAY_INDENT):
        """ Placeholder: currently returns an empty string. """
        return ""

    @staticmethod
    def _get_individual_string(name, result, indent=BaseContext.DISPLAY_INDENT):
        """ Placeholder: currently returns an empty string. """
        return ""
2 changes: 2 additions & 0 deletions hed/tools/remodeling/operations/valid_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from hed.tools.remodeling.operations.split_rows_op import SplitRowsOp
from hed.tools.remodeling.operations.summarize_column_names_op import SummarizeColumnNamesOp
from hed.tools.remodeling.operations.summarize_column_values_op import SummarizeColumnValuesOp
from hed.tools.remodeling.operations.summarize_definitions_op import SummarizeDefinitionsOp
from hed.tools.remodeling.operations.summarize_sidecar_from_events_op import SummarizeSidecarFromEventsOp
from hed.tools.remodeling.operations.summarize_hed_type_op import SummarizeHedTypeOp
from hed.tools.remodeling.operations.summarize_hed_tags_op import SummarizeHedTagsOp
Expand All @@ -36,6 +37,7 @@
'split_rows': SplitRowsOp,
'summarize_column_names': SummarizeColumnNamesOp,
'summarize_column_values': SummarizeColumnValuesOp,
'summarize_definitions': SummarizeDefinitionsOp,
'summarize_sidecar_from_events': SummarizeSidecarFromEventsOp,
'summarize_hed_type': SummarizeHedTypeOp,
'summarize_hed_tags': SummarizeHedTagsOp,
Expand Down
18 changes: 18 additions & 0 deletions tests/tools/bids/test_bids_tabular_dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,24 @@ def test_report_diffs_diff_rows(self):
self.assertTrue(output, "report_diffs has differences")
self.assertTrue(logger.log, "report_diffs the logger is empty before report is called")

def test_with_tabular_summary(self):
    """ make_combined_dicts yields one individual dictionary per BIDS event file. """
    from hed.tools.analysis.tabular_summary import TabularSummary
    # Anchor the data directory to this test file rather than the current
    # working directory, so the test finds its data wherever it is run from.
    bids_root_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                                   '../../data/bids_tests/eeg_ds003645s_hed'))
    name = 'eeg_ds003645s_hed'
    exclude_dirs = ['stimuli']
    entities = ('sub', 'run')
    skip_columns = ["onset", "duration", "sample", "stim_file", "trial", "response_time"]

    # Construct the file dictionary for the BIDS event files
    event_files = get_file_list(bids_root_path, extensions=[".tsv"], name_suffix="_events",
                                exclude_dirs=exclude_dirs)
    # Without this guard an unresolved data path gives an empty list and the
    # length comparison below passes vacuously (0 == 0).
    self.assertTrue(event_files, "test_with_tabular_summary found no event files to summarize")
    bids_tab = BidsTabularDictionary(name, event_files, entities=entities)

    # Create a summary of the original BIDS events file content
    _, bids_dicts = TabularSummary.make_combined_dicts(bids_tab.file_dict, skip_cols=skip_columns)
    self.assertIsInstance(bids_dicts, dict)
    self.assertEqual(len(bids_dicts), len(event_files))


if __name__ == '__main__':
unittest.main()
60 changes: 60 additions & 0 deletions tests/tools/remodeling/operations/test_summarize_definitions_op.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import json
import os
import unittest
import pandas as pd
from hed.models.df_util import get_assembled
from hed.tools.remodeling.dispatcher import Dispatcher
from hed.tools.remodeling.operations.summarize_definitions_op import SummarizeDefinitionsOp, DefinitionSummaryContext


class Test(unittest.TestCase):
    """ Tests for SummarizeDefinitionsOp and the context it registers. """

    @classmethod
    def setUpClass(cls):
        """ Locate the remodeling test data and serialize the base parameters. """
        path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                             '../../../data/remodel_tests/'))
        cls.data_path = os.path.realpath(os.path.join(path, 'sub-002_task-FacePerception_run-1_events.tsv'))
        cls.json_path = os.path.realpath(os.path.join(path, 'task-FacePerception_events.json'))
        base_parameters = {
            "summary_name": 'get_definition_summary',
            "summary_filename": 'summarize_definitions'
        }
        # Stored as JSON text so every test gets a fresh dict from json.loads.
        cls.json_parms = json.dumps(base_parameters)

    @classmethod
    def tearDownClass(cls):
        pass

    def test_constructor(self):
        """ Valid parameters construct the op; unexpected parameters raise KeyError. """
        parms = json.loads(self.json_parms)
        sum_op1 = SummarizeDefinitionsOp(parms)
        self.assertIsInstance(sum_op1, SummarizeDefinitionsOp, "constructor creates an object of the correct type")
        parms["expand_context"] = ""
        with self.assertRaises(KeyError) as context:
            SummarizeDefinitionsOp(parms)
        self.assertEqual(context.exception.args[0], "BadParameter")
        parms2 = json.loads(self.json_parms)
        parms2["mystery"] = True
        with self.assertRaises(KeyError) as context:
            SummarizeDefinitionsOp(parms2)
        self.assertEqual(context.exception.args[0], "BadParameter")

    def test_do_op(self):
        """ do_op returns the data with its shape intact and registers a DefinitionSummaryContext. """
        dispatch = Dispatcher([], data_root=None, backup_name=None, hed_versions=['8.1.0'])
        parms = json.loads(self.json_parms)
        sum_op = SummarizeDefinitionsOp(parms)
        self.assertIsInstance(sum_op, SummarizeDefinitionsOp, "constructor creates an object of the correct type")
        df = pd.read_csv(self.data_path, delimiter='\t', header=0, keep_default_na=False, na_values=",null")
        df_new = sum_op.do_op(dispatch, dispatch.prep_data(df), 'subj2_run1', sidecar=self.json_path)
        self.assertEqual(200, len(df_new), "summarize_definitions_op dataframe length is correct")
        self.assertEqual(10, len(df_new.columns), "summarize_definitions_op has correct number of columns")
        self.assertIn(sum_op.summary_name, dispatch.context_dict)
        self.assertIsInstance(dispatch.context_dict[sum_op.summary_name], DefinitionSummaryContext)
        # TODO: assert on the accumulated definitions (defs / unresolved / errors)
        # once the expected contents for this data file are established, and
        # repeat do_op with a second name to check accumulation across files.


if __name__ == '__main__':
unittest.main()
12 changes: 0 additions & 12 deletions tests/tools/remodeling/operations/test_summarize_hed_tags_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,18 +65,6 @@ def test_do_op(self):
df_new = sum_op.do_op(dispatch, dispatch.prep_data(df), 'subj2_run2', sidecar=self.json_path)
self.assertEqual(len(dispatch.context_dict[sum_op.summary_name].summary_dict['subj2_run2'].tag_dict), 47)

def test_quick_test(self):
    """ Smoke test: read tag_terms from a HedTag built without and with a schema.

    No assertions are made; the test only checks that nothing raises.

    """
    from hed.models.hed_tag import HedTag
    from hed.schema import load_schema_version
    tag_text = "Description/This is a test"

    # Tag constructed without a schema.
    terms_without_schema = HedTag(tag_text).tag_terms

    # Same tag text, constructed with the 8.1.0 schema.
    schema = load_schema_version('8.1.0')
    tag_with_schema = HedTag(tag_text, hed_schema=schema)
    terms_with_schema = tag_with_schema.tag_terms

def test_quick3(self):
from hed.models import TabularInput, Sidecar
from hed.schema import load_schema_version
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ def test_get_summary_text_summary(self):

sum_context1 = dispatch.context_dict[sum_op.summary_name]
text_sum1 = sum_context1.get_text_summary(individual_summaries="separate")
# print(text_sum1)
sum_op.do_op(dispatch, df, 'subj2_run2', sidecar=self.json_path)
sum_op.do_op(dispatch, df, 'subj2_run3', sidecar=self.bad_json_path)
text_sum2 = sum_context1.get_text_summary(individual_summaries="none")
Expand Down