Skip to content

Commit

Permalink
Merge pull request #647 from VisLab/develop
Browse files Browse the repository at this point in the history
Updated docs on summary ops
  • Loading branch information
VisLab authored Apr 6, 2023
2 parents 7e29789 + c8b5eb1 commit cd6d8cd
Show file tree
Hide file tree
Showing 13 changed files with 513 additions and 55 deletions.
2 changes: 1 addition & 1 deletion docs/source/_templates/layout.html
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
<li class="toctree-li"><a href="https://www.hed-resources.org/en/latest/">Main resource page</a></li>
<li class="toctree-li"><a href="https://www.hedtags.org">Project home page</a></li>
<li class="toctree-li"><a href="https://www.hedtags.org/display_hed.html">Schema viewer</a></li>
<li class="toctree-li"><a href="https://www.hedtags.org/display_hed_library.html">Library schema viewer</a></li>
<li class="toctree-li"><a href="https://www.hedtags.org/display_hed_prerelease.html">Prerelease schema viewer</a></li>
<li class="toctree-li"><a href="https://hed-specification.readthedocs.io/en/latest/index.html">Specification</a></li>
<li class="toctree-li"><a href="https://hed-examples.readthedocs.io/en/latest/">Examples and tutorials</a></li>
<li class="toctree-li"><a href="https://hedtools.ucsd.edu/hed">Online tools</a></li>
Expand Down
79 changes: 72 additions & 7 deletions hed/tools/remodeling/operations/base_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,20 +26,49 @@ def __init__(self, context_type, context_name, context_filename):
self.summary_dict = {}

def get_summary_details(self, include_individual=True):
""" Return a dictionary with the details for individual files and the overall dataset.
Parameters:
include_individual (bool): If True, summaries for individual files are included.
Returns:
dict - a dictionary with 'Dataset' and 'Individual files' keys.
Notes:
- The 'Dataset' value is either a string or a dictionary with the overall summary.
- The 'Individual files' value is a dictionary whose keys are file names and whose values are
their corresponding summaries.
Users are expected to provide _merge_all and _get_details_dict to support this.
"""
merged_summary = self._merge_all()
if merged_summary:
details = self._get_summary_details(merged_summary)
details = self._get_details_dict(merged_summary)
else:
details = "Overall summary unavailable"

summary_details = {"Dataset": details, "Individual files": {}}
if include_individual:
for name, count in self.summary_dict.items():
summary_details["Individual files"][name] = self._get_summary_details(count)
summary_details["Individual files"][name] = self._get_details_dict(count)
return summary_details

def get_summary(self, individual_summaries="separate"):
""" Return a summary dictionary with the information.
Parameters:
individual_summaries (str): "separate", "consolidated", or "none"
Returns:
dict - dictionary with "Dataset" and "Individual files" keys.
Notes: The individual_summaries value is processed as follows:
- "separate" means that the individual summaries are saved in separate files.
- "consolidated" means that the individual summaries are in the same file as the overall summary.
- "none" means that only the overall summary is produced.
"""
include_individual = individual_summaries == "separate" or individual_summaries == "consolidated"
summary_details = self.get_summary_details(include_individual=include_individual)
dataset_summary = {"Context name": self.context_name, "Context type": self.context_type,
Expand Down Expand Up @@ -99,9 +128,17 @@ def save(self, save_dir, file_formats=['.txt'], individual_summaries="separate")
summary = self.get_summary(individual_summaries=individual_summaries)
else:
continue
self._save_separate(save_dir, file_format, summary, individual_summaries)
self._save_summary_files(save_dir, file_format, summary, individual_summaries)

def _save_separate(self, save_dir, file_format, summary, individual_summaries):
def _save_summary_files(self, save_dir, file_format, summary, individual_summaries):
""" Save the files in the appropriate format.
Parameters:
save_dir (str): Path to the directory in which the summaries will be saved.
file_format (str): string representing the extension (including .), '.txt' or '.json'.
summary (dictionary): Dictionary of summaries (has "Dataset" and "Individual files" keys).
"""
time_stamp = '_' + get_timestamp()
this_save = os.path.join(save_dir, self.context_name + '/')
os.makedirs(os.path.realpath(this_save), exist_ok=True)
Expand All @@ -117,10 +154,21 @@ def _save_separate(self, save_dir, file_format, summary, individual_summaries):
individual_dir = os.path.join(this_save, self.INDIVIDUAL_SUMMARIES_PATH + '/')
os.makedirs(os.path.realpath(individual_dir), exist_ok=True)
for name, sum_str in individual.items():
filename = self._get_individual_filename(individual_dir, name, time_stamp, file_format)
filename = self._get_summary_filepath(individual_dir, name, time_stamp, file_format)
self.dump_summary(filename, sum_str)

def _get_individual_filename(self, individual_dir, name, time_stamp, file_format):
def _get_summary_filepath(self, individual_dir, name, time_stamp, file_format):
""" Return the filepath for the summary including the timestamp
Parameters:
individual_dir (str): path of the directory in which the summary should be stored.
name (str): Path of the original file from which the summary was extracted.
time_stamp (str): Formatted date-time string to be included in the filename of the summary.
Returns:
str: Full path name of the summary.
"""
this_name = os.path.basename(name)
this_name = os.path.splitext(this_name)[0]
count = 1
Expand All @@ -135,6 +183,20 @@ def _get_individual_filename(self, individual_dir, name, time_stamp, file_format
return filename

def _get_result_string(self, name, result, indent=DISPLAY_INDENT):
""" Return a formatted string with the summary for the indicated name.
Parameters:
name (str): Identifier (usually the filename) of the individual file.
result (dict): The dictionary of the summary results indexed by name.
indent (str): A string containing spaces used for indentation (usually 3 spaces).
Returns:
str - The results in a printable format ready to be saved to a text file.
Notes:
This method should be overridden by each summary.
"""
return f"\n{name}\n{indent}{str(result)}"

@staticmethod
Expand All @@ -145,12 +207,15 @@ def dump_summary(filename, summary):
text_file.write(summary)

@abstractmethod
def _get_summary_details(self, summary_info):
def _get_details_dict(self, summary_info):
""" Return the summary-specific information.
Parameters:
summary_info (object): Summary to return info from
Returns:
dict: dictionary with the results.
Notes:
Abstract method to be implemented by each individual context summary.
Expand Down
51 changes: 50 additions & 1 deletion hed/tools/remodeling/operations/summarize_column_names_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,29 +78,78 @@ def __init__(self, sum_op):
super().__init__(sum_op.SUMMARY_TYPE, sum_op.summary_name, sum_op.summary_filename)

def update_context(self, new_context):
""" Update the summary for a given tabular input file.
Parameters:
new_context (dict): A dictionary with the parameters needed to update a summary.
Notes:
- The summary information is kept in separate ColumnNameSummary objects for each file.
- The summary needs a "name" str and a "column_names" list.
- The summary uses ColumnNameSummary as the summary object.
"""
name = new_context['name']
if name not in self.summary_dict:
self.summary_dict[name] = ColumnNameSummary(name=name)
self.summary_dict[name].update(name, new_context["column_names"])

def _get_summary_details(self, column_summary):
def _get_details_dict(self, column_summary):
""" Return the summary dictionary extracted from a ColumnNameSummary.
Parameters:
column_summary (ColumnNameSummary): A column name summary for the data file.
Returns:
dict - a dictionary with the summary information for column names.
"""
return column_summary.get_summary()

def _merge_all(self):
""" Create a ColumnNameSummary containing the overall dataset summary.
Returns:
ColumnNameSummary - the overall summary object for column names.
"""
all_sum = ColumnNameSummary(name='Dataset')
for key, counts in self.summary_dict.items():
for name, pos in counts.file_dict.items():
all_sum.update(name, counts.unique_headers[pos])
return all_sum

def _get_result_string(self, name, result, indent=BaseContext.DISPLAY_INDENT):
""" Return a formatted string with the summary for the indicated name.
Parameters:
name (str): Identifier (usually the filename) of the individual file.
result (dict): The dictionary of the summary results indexed by name.
indent (str): A string containing spaces used for indentation (usually 3 spaces).
Returns:
str - The results in a printable format ready to be saved to a text file.
Notes:
This calls _get_dataset_string to get the overall summary string.
"""
if name == "Dataset":
return self._get_dataset_string(result, indent)
columns = result["Columns"][0]
return f"{indent}{str(columns['Column names'])}"

@staticmethod
def _get_dataset_string(result, indent=BaseContext.DISPLAY_INDENT):
""" Return a string with the overall summary for all of the tabular files.
Parameters:
result (dict): Dictionary of merged summary information.
indent (str): String of blanks used as the amount to indent for readability.
Returns:
str: Formatted string suitable for saving in a file or printing.
"""
sum_list = [f"Dataset: Number of files={result.get('Number files', 0)}"]
for element in result.get("Columns", []):
sum_list.append(f"{indent}Columns: {str(element['Column names'])}")
Expand Down
67 changes: 64 additions & 3 deletions hed/tools/remodeling/operations/summarize_column_values_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,28 +88,79 @@ def __init__(self, sum_op):
self.skip_columns = sum_op.skip_columns

def update_context(self, new_context):
""" Update the summary for a given tabular input file.
Parameters:
new_context (dict): A dictionary with the parameters needed to update a summary.
Notes:
- The summary information is kept in separate TabularSummary objects for each file.
- The summary needs a "name" str and a "df".
"""
name = new_context['name']
if name not in self.summary_dict:
self.summary_dict[name] = \
TabularSummary(value_cols=self.value_columns, skip_cols=self.skip_columns, name=name)
self.summary_dict[name].update(new_context['df'])

def _get_summary_details(self, summary):
def _get_details_dict(self, summary):
""" Return a dictionary with the summary contained in a TabularSummary
Parameters:
summary (TabularSummary): Summary object (for an individual file or the merged dataset) to extract the information from.
Returns:
dict: Dictionary with the information suitable for extracting printout.
"""
return summary.get_summary(as_json=False)

def _merge_all(self):
""" Create a TabularSummary containing the overall dataset summary.
Returns:
TabularSummary - the summary object for column values.
"""
all_sum = TabularSummary(value_cols=self.value_columns, skip_cols=self.skip_columns, name='Dataset')
for key, counts in self.summary_dict.items():
all_sum.update_summary(counts)
return all_sum

def _get_result_string(self, name, result, indent=BaseContext.DISPLAY_INDENT):
""" Return a formatted string with the summary for the indicated name.
Parameters:
name (str): Identifier (usually the filename) of the individual file.
result (dict): The dictionary of the summary results indexed by name.
indent (str): A string containing spaces used for indentation (usually 3 spaces).
Returns:
str - The results in a printable format ready to be saved to a text file.
Notes:
This calls _get_dataset_string to get the overall summary string and
_get_individual_string to get an individual summary string.
"""

if name == "Dataset":
return self._get_dataset_string(result, indent=indent)
return self._get_individual_string(name, result, indent=indent)
return self._get_individual_string(result, indent=indent)

@staticmethod
def _get_dataset_string(result, indent=BaseContext.DISPLAY_INDENT):
""" Return a string with the overall summary for all of the tabular files.
Parameters:
result (dict): Dictionary of merged summary information.
indent (str): String of blanks used as the amount to indent for readability.
Returns:
str: Formatted string suitable for saving in a file or printing.
"""
sum_list = [f"Dataset: Total events={result.get('Total events', 0)} "
f"Total files={result.get('Total files', 0)}"]
cat_cols = result.get("Categorical columns", {})
Expand All @@ -121,7 +172,17 @@ def _get_dataset_string(result, indent=BaseContext.DISPLAY_INDENT):
return "\n".join(sum_list)

@staticmethod
def _get_individual_string(name, result, indent=BaseContext.DISPLAY_INDENT):
def _get_individual_string(result, indent=BaseContext.DISPLAY_INDENT):
""" Return a string with the summary for an individual tabular file.
Parameters:
result (dict): Dictionary of summary information for a particular tabular file.
indent (str): String of blanks used as the amount to indent for readability.
Returns:
str: Formatted string suitable for saving in a file or printing.
"""
sum_list = [f"Total events={result.get('Total events', 0)}"]
cat_cols = result.get("Categorical columns", {})
if cat_cols:
Expand Down
Loading

0 comments on commit cd6d8cd

Please sign in to comment.