Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated docs on summary ops #647

Merged
merged 1 commit into from
Apr 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/source/_templates/layout.html
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
<li class="toctree-li"><a href="https://www.hed-resources.org/en/latest/">Main resource page</a></li>
<li class="toctree-li"><a href="https://www.hedtags.org">Project home page</a></li>
<li class="toctree-li"><a href="https://www.hedtags.org/display_hed.html">Schema viewer</a></li>
<li class="toctree-li"><a href="https://www.hedtags.org/display_hed_library.html">Library schema viewer</a></li>
<li class="toctree-li"><a href="https://www.hedtags.org/display_hed_prerelease.html">Prerelease schema viewer</a></li>
<li class="toctree-li"><a href="https://hed-specification.readthedocs.io/en/latest/index.html">Specification</a></li>
<li class="toctree-li"><a href="https://hed-examples.readthedocs.io/en/latest/">Examples and tutorials</a></li>
<li class="toctree-li"><a href="https://hedtools.ucsd.edu/hed">Online tools</a></li>
Expand Down
79 changes: 72 additions & 7 deletions hed/tools/remodeling/operations/base_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,20 +26,49 @@ def __init__(self, context_type, context_name, context_filename):
self.summary_dict = {}

def get_summary_details(self, include_individual=True):
""" Return a dictionary with the details for individual files and the overall dataset.

Parameters:
include_individual (bool): If True, summaries for individual files are included.

Returns:
dict - a dictionary with 'Dataset' and 'Individual files' keys.

Notes:
- The 'Dataset' value is either a string or a dictionary with the overall summary.
- The 'Individual files' value is a dictionary whose keys are file names and values are
their corresponding summaries.

Subclasses are expected to implement _merge_all and _get_details_dict to support this.

"""
merged_summary = self._merge_all()
if merged_summary:
details = self._get_summary_details(merged_summary)
details = self._get_details_dict(merged_summary)
else:
details = "Overall summary unavailable"

summary_details = {"Dataset": details, "Individual files": {}}
if include_individual:
for name, count in self.summary_dict.items():
summary_details["Individual files"][name] = self._get_summary_details(count)
summary_details["Individual files"][name] = self._get_details_dict(count)
return summary_details

def get_summary(self, individual_summaries="separate"):
""" Return a summary dictionary with the information.

Parameters:
individual_summaries (str): "separate", "consolidated", or "none"

Returns:
dict - dictionary with "Dataset" and "Individual files" keys.

Notes: The individual_summaries value is processed as follows:
- "separate" means that the individual summaries are to be in separate files.
- "consolidated" means that the individual summaries are in the same file as the overall summary.
- "none" means that only the overall summary is produced.

"""
include_individual = individual_summaries == "separate" or individual_summaries == "consolidated"
summary_details = self.get_summary_details(include_individual=include_individual)
dataset_summary = {"Context name": self.context_name, "Context type": self.context_type,
Expand Down Expand Up @@ -99,9 +128,17 @@ def save(self, save_dir, file_formats=['.txt'], individual_summaries="separate")
summary = self.get_summary(individual_summaries=individual_summaries)
else:
continue
self._save_separate(save_dir, file_format, summary, individual_summaries)
self._save_summary_files(save_dir, file_format, summary, individual_summaries)

def _save_separate(self, save_dir, file_format, summary, individual_summaries):
def _save_summary_files(self, save_dir, file_format, summary, individual_summaries):
""" Save the files in the appropriate format.

Parameters:
save_dir (str): Path to the directory in which the summaries will be saved.
file_format (str): string representing the extension (including .), '.txt' or '.json'.
summary (dictionary): Dictionary of summaries (has "Dataset" and "Individual files" keys).
individual_summaries (str): "separate", "consolidated", or "none".

"""
time_stamp = '_' + get_timestamp()
this_save = os.path.join(save_dir, self.context_name + '/')
os.makedirs(os.path.realpath(this_save), exist_ok=True)
Expand All @@ -117,10 +154,21 @@ def _save_separate(self, save_dir, file_format, summary, individual_summaries):
individual_dir = os.path.join(this_save, self.INDIVIDUAL_SUMMARIES_PATH + '/')
os.makedirs(os.path.realpath(individual_dir), exist_ok=True)
for name, sum_str in individual.items():
filename = self._get_individual_filename(individual_dir, name, time_stamp, file_format)
filename = self._get_summary_filepath(individual_dir, name, time_stamp, file_format)
self.dump_summary(filename, sum_str)

def _get_individual_filename(self, individual_dir, name, time_stamp, file_format):
def _get_summary_filepath(self, individual_dir, name, time_stamp, file_format):
""" Return the filepath for the summary including the timestamp.

Parameters:
individual_dir (str): path of the directory in which the summary should be stored.
name (str): Path of the original file from which the summary was extracted.
time_stamp (str): Formatted date-time string to be included in the filename of the summary.
file_format (str): string representing the extension (including .), '.txt' or '.json'.

Returns:
str: Full path name of the summary.

"""
this_name = os.path.basename(name)
this_name = os.path.splitext(this_name)[0]
count = 1
Expand All @@ -135,6 +183,20 @@ def _get_individual_filename(self, individual_dir, name, time_stamp, file_format
return filename

def _get_result_string(self, name, result, indent=DISPLAY_INDENT):
""" Return a formatted string with the summary for the indicated name.

Parameters:
name (str): Identifier (usually the filename) of the individual file.
result (dict): The dictionary of the summary results indexed by name.
indent (str): A string containing spaces used for indentation (usually 3 spaces).

Returns:
str - The results in a printable format ready to be saved to a text file.

Notes:
This method should be overridden by each summary.

"""
return f"\n{name}\n{indent}{str(result)}"

@staticmethod
Expand All @@ -145,12 +207,15 @@ def dump_summary(filename, summary):
text_file.write(summary)

@abstractmethod
def _get_summary_details(self, summary_info):
def _get_details_dict(self, summary_info):
""" Return the summary-specific information.

Parameters:
summary_info (object): Summary to return info from

Returns:
dict: dictionary with the results.

Notes:
Abstract method to be implemented by each individual context summary.

Expand Down
51 changes: 50 additions & 1 deletion hed/tools/remodeling/operations/summarize_column_names_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,29 +78,78 @@ def __init__(self, sum_op):
super().__init__(sum_op.SUMMARY_TYPE, sum_op.summary_name, sum_op.summary_filename)

def update_context(self, new_context):
""" Update the summary for a given tabular input file.

Parameters:
new_context (dict): A dictionary with the parameters needed to update a summary.

Notes:
- The summary information is kept in separate ColumnNameSummary objects for each file.
- The summary needs a "name" str and a "column_names" list.
- The summary uses ColumnNameSummary as the summary object.
"""
name = new_context['name']
if name not in self.summary_dict:
self.summary_dict[name] = ColumnNameSummary(name=name)
self.summary_dict[name].update(name, new_context["column_names"])

def _get_summary_details(self, column_summary):
def _get_details_dict(self, column_summary):
""" Return the summary dictionary extracted from a ColumnNameSummary.

Parameters:
column_summary (ColumnNameSummary): A column name summary for the data file.

Returns:
dict - a dictionary with the summary information for column names.

"""
return column_summary.get_summary()

def _merge_all(self):
""" Create a ColumnNameSummary containing the overall dataset summary.

Returns:
ColumnNameSummary - the overall summary object for column names.

"""
all_sum = ColumnNameSummary(name='Dataset')
for key, counts in self.summary_dict.items():
for name, pos in counts.file_dict.items():
all_sum.update(name, counts.unique_headers[pos])
return all_sum

def _get_result_string(self, name, result, indent=BaseContext.DISPLAY_INDENT):
""" Return a formatted string with the summary for the indicated name.

Parameters:
name (str): Identifier (usually the filename) of the individual file.
result (dict): The dictionary of the summary results indexed by name.
indent (str): A string containing spaces used for indentation (usually 3 spaces).

Returns:
str - The results in a printable format ready to be saved to a text file.

Notes:
This calls _get_dataset_string to get the overall summary string.

"""
if name == "Dataset":
return self._get_dataset_string(result, indent)
columns = result["Columns"][0]
return f"{indent}{str(columns['Column names'])}"

@staticmethod
def _get_dataset_string(result, indent=BaseContext.DISPLAY_INDENT):
""" Return a string with the overall summary for all of the tabular files.

Parameters:
result (dict): Dictionary of merged summary information.
indent (str): String of blanks used as the amount to indent for readability.

Returns:
str: Formatted string suitable for saving in a file or printing.

"""
sum_list = [f"Dataset: Number of files={result.get('Number files', 0)}"]
for element in result.get("Columns", []):
sum_list.append(f"{indent}Columns: {str(element['Column names'])}")
Expand Down
67 changes: 64 additions & 3 deletions hed/tools/remodeling/operations/summarize_column_values_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,28 +88,79 @@ def __init__(self, sum_op):
self.skip_columns = sum_op.skip_columns

def update_context(self, new_context):
""" Update the summary for a given tabular input file.

Parameters:
new_context (dict): A dictionary with the parameters needed to update a summary.

Notes:
- The summary information is kept in separate TabularSummary objects for each file.
- The summary needs a "name" str and a "df".

"""
name = new_context['name']
if name not in self.summary_dict:
self.summary_dict[name] = \
TabularSummary(value_cols=self.value_columns, skip_cols=self.skip_columns, name=name)
self.summary_dict[name].update(new_context['df'])

def _get_summary_details(self, summary):
def _get_details_dict(self, summary):
""" Return a dictionary with the summary contained in a TabularSummary.

Parameters:
summary (TabularSummary): Summary object holding the tabular summary information.

Returns:
dict: Dictionary with the information suitable for extracting printout.

"""
return summary.get_summary(as_json=False)

def _merge_all(self):
""" Create a TabularSummary containing the overall dataset summary.

Returns:
TabularSummary - the summary object for column values.

"""
all_sum = TabularSummary(value_cols=self.value_columns, skip_cols=self.skip_columns, name='Dataset')
for key, counts in self.summary_dict.items():
all_sum.update_summary(counts)
return all_sum

def _get_result_string(self, name, result, indent=BaseContext.DISPLAY_INDENT):
""" Return a formatted string with the summary for the indicated name.

Parameters:
name (str): Identifier (usually the filename) of the individual file.
result (dict): The dictionary of the summary results indexed by name.
indent (str): A string containing spaces used for indentation (usually 3 spaces).

Returns:
str - The results in a printable format ready to be saved to a text file.

Notes:
This calls _get_dataset_string to get the overall summary string and
_get_individual_string to get an individual summary string.

"""

if name == "Dataset":
return self._get_dataset_string(result, indent=indent)
return self._get_individual_string(name, result, indent=indent)
return self._get_individual_string(result, indent=indent)

@staticmethod
def _get_dataset_string(result, indent=BaseContext.DISPLAY_INDENT):
""" Return a string with the overall summary for all of the tabular files.

Parameters:
result (dict): Dictionary of merged summary information.
indent (str): String of blanks used as the amount to indent for readability.

Returns:
str: Formatted string suitable for saving in a file or printing.

"""
sum_list = [f"Dataset: Total events={result.get('Total events', 0)} "
f"Total files={result.get('Total files', 0)}"]
cat_cols = result.get("Categorical columns", {})
Expand All @@ -121,7 +172,17 @@ def _get_dataset_string(result, indent=BaseContext.DISPLAY_INDENT):
return "\n".join(sum_list)

@staticmethod
def _get_individual_string(name, result, indent=BaseContext.DISPLAY_INDENT):
def _get_individual_string(result, indent=BaseContext.DISPLAY_INDENT):
""" Return a string with the summary for an individual tabular file.

Parameters:
result (dict): Dictionary of summary information for a particular tabular file.
indent (str): String of blanks used as the amount to indent for readability.

Returns:
str: Formatted string suitable for saving in a file or printing.

"""
sum_list = [f"Total events={result.get('Total events', 0)}"]
cat_cols = result.get("Categorical columns", {})
if cat_cols:
Expand Down
Loading