Merge pull request #647 from VisLab/develop

Updated docs on summary ops
hed-standard · Apr 6, 2023 · cd6d8cd · cd6d8cd
2 parents 7e29789 + c8b5eb1
commit cd6d8cd
Show file tree

Hide file tree

Showing 13 changed files with 513 additions and 55 deletions.
diff --git a/docs/source/_templates/layout.html b/docs/source/_templates/layout.html
@@ -12,7 +12,7 @@
       <li class="toctree-li"><a href="https://www.hed-resources.org/en/latest/">Main resource page</a></li>
       <li class="toctree-li"><a href="https://www.hedtags.org">Project home page</a></li>
       <li class="toctree-li"><a href="https://www.hedtags.org/display_hed.html">Schema viewer</a></li>
-      <li class="toctree-li"><a href="https://www.hedtags.org/display_hed_library.html">Library schema viewer</a></li>
+      <li class="toctree-li"><a href="https://www.hedtags.org/display_hed_prerelease.html">Prerelease schema viewer</a></li>
       <li class="toctree-li"><a href="https://hed-specification.readthedocs.io/en/latest/index.html">Specification</a></li>
       <li class="toctree-li"><a href="https://hed-examples.readthedocs.io/en/latest/">Examples and tutorials</a></li>
       <li class="toctree-li"><a href="https://hedtools.ucsd.edu/hed">Online tools</a></li>

diff --git a/hed/tools/remodeling/operations/base_context.py b/hed/tools/remodeling/operations/base_context.py
@@ -26,20 +26,49 @@ def __init__(self, context_type, context_name, context_filename):
         self.summary_dict = {}
 
     def get_summary_details(self, include_individual=True):
+        """ Return a dictionary with the details for individual files and the overall dataset.
+
+        Parameters:
+            include_individual (bool):  If True, summaries for individual files are included.
+
+        Returns:
+            dict - a dictionary with 'Dataset' and 'Individual files' keys.
+
+        Notes:
+            - The 'Dataset' value is either a string or a dictionary with the overall summary.
+            - The 'Individual files' value is a dictionary whose keys are file names and whose values are
+                   their corresponding summaries.
+
+        Subclasses are expected to implement _merge_all and _get_details_dict to support this.
+
+        """
         merged_summary = self._merge_all()
         if merged_summary:
-            details = self._get_summary_details(merged_summary)
+            details = self._get_details_dict(merged_summary)
         else:
             details = "Overall summary unavailable"
 
         summary_details = {"Dataset": details, "Individual files": {}}
         if include_individual:
             for name, count in self.summary_dict.items():
-                summary_details["Individual files"][name] = self._get_summary_details(count)
+                summary_details["Individual files"][name] = self._get_details_dict(count)
         return summary_details
 
     def get_summary(self, individual_summaries="separate"):
+        """ Return a summary dictionary with the information.
+
+        Parameters:
+            individual_summaries (str): "separate", "consolidated", or "none"
 
+        Returns:
+            dict - dictionary with "Dataset" and "Individual files" keys.
+            
+        Notes: The individual_summaries value is processed as follows
+           -  "separate" means that the individual summaries are to be in separate files.
+           -  "consolidated" means that the individual summaries are in the same file as the overall summary.
+           -  "none" means that only the overall summary is produced.
+            
+        """
         include_individual = individual_summaries == "separate" or individual_summaries == "consolidated"
         summary_details = self.get_summary_details(include_individual=include_individual)
         dataset_summary = {"Context name": self.context_name, "Context type": self.context_type,
@@ -99,9 +128,17 @@ def save(self, save_dir, file_formats=['.txt'], individual_summaries="separate")
                 summary = self.get_summary(individual_summaries=individual_summaries)
             else:
                 continue
-            self._save_separate(save_dir, file_format, summary, individual_summaries)
+            self._save_summary_files(save_dir, file_format, summary, individual_summaries)
 
-    def _save_separate(self, save_dir, file_format, summary, individual_summaries):
+    def _save_summary_files(self, save_dir, file_format, summary, individual_summaries):
+        """ Save the files in the appropriate format.
+        
+        Parameters:
+            save_dir (str): Path to the directory in which the summaries will be saved.
+            file_format (str): string representing the extension (including .), '.txt' or '.json'.
+            summary (dictionary): Dictionary of summaries (has "Dataset" and "Individual files" keys).
+        
+        """
         time_stamp = '_' + get_timestamp()
         this_save = os.path.join(save_dir, self.context_name + '/')
         os.makedirs(os.path.realpath(this_save), exist_ok=True)
@@ -117,10 +154,21 @@ def _save_separate(self, save_dir, file_format, summary, individual_summaries):
         individual_dir = os.path.join(this_save, self.INDIVIDUAL_SUMMARIES_PATH + '/')
         os.makedirs(os.path.realpath(individual_dir), exist_ok=True)
         for name, sum_str in individual.items():
-            filename = self._get_individual_filename(individual_dir, name, time_stamp, file_format)
+            filename = self._get_summary_filepath(individual_dir, name, time_stamp, file_format)
             self.dump_summary(filename, sum_str)
 
-    def _get_individual_filename(self, individual_dir, name, time_stamp, file_format):
+    def _get_summary_filepath(self, individual_dir, name, time_stamp, file_format):
+        """ Return the filepath for the summary including the timestamp
+        
+        Parameters:
+            individual_dir (str):  path of the directory in which the summary should be stored.
+            name (str): Path of the original file from which the summary was extracted.
+            time_stamp (str):  Formatted date-time string to be included in the filename of the summary.
+
+        Returns:
+            str: Full path name of the summary.
+
+        """
         this_name = os.path.basename(name)
         this_name = os.path.splitext(this_name)[0]
         count = 1
@@ -135,6 +183,20 @@ def _get_individual_filename(self, individual_dir, name, time_stamp, file_format
         return filename
 
     def _get_result_string(self, name, result, indent=DISPLAY_INDENT):
+        """ Return a formatted string with the summary for the indicated name.
+
+        Parameters:
+            name (str):  Identifier (usually the filename) of the individual file.
+            result (dict): The dictionary of the summary results indexed by name.
+            indent (str): A string containing spaces used for indentation (usually 3 spaces).
+
+        Returns:
+            str - The results in a printable format ready to be saved to a text file.
+
+        Notes:
+            This method should be overridden by each summary.
+
+        """
         return f"\n{name}\n{indent}{str(result)}"
 
     @staticmethod
@@ -145,12 +207,15 @@ def dump_summary(filename, summary):
             text_file.write(summary)
 
     @abstractmethod
-    def _get_summary_details(self, summary_info):
+    def _get_details_dict(self, summary_info):
         """ Return the summary-specific information.
 
         Parameters:
             summary_info (object):  Summary to return info from
 
+        Returns:
+            dict: dictionary with the results.
+
         Notes:
             Abstract method be implemented by each individual context summary.
 

diff --git a/hed/tools/remodeling/operations/summarize_column_names_op.py b/hed/tools/remodeling/operations/summarize_column_names_op.py
@@ -78,29 +78,78 @@ def __init__(self, sum_op):
         super().__init__(sum_op.SUMMARY_TYPE, sum_op.summary_name, sum_op.summary_filename)
 
     def update_context(self, new_context):
+        """ Update the summary for a given tabular input file.
+
+        Parameters:
+            new_context (dict):  A dictionary with the parameters needed to update a summary.
+
+        Notes:
+            - The summary information is kept in separate ColumnNameSummary objects for each file.  
+            - The summary needs a "name" str and a "column_names" list.  
+            - The summary uses ColumnNameSummary as the summary object.
+        """
         name = new_context['name']
         if name not in self.summary_dict:
             self.summary_dict[name] = ColumnNameSummary(name=name)
         self.summary_dict[name].update(name, new_context["column_names"])
 
-    def _get_summary_details(self, column_summary):
+    def _get_details_dict(self, column_summary):
+        """ Return the summary dictionary extracted from a ColumnNameSummary.
+
+        Parameters:
+            column_summary (ColumnNameSummary):  A column name summary for the data file.
+
+        Returns:
+            dict - a dictionary with the summary information for column names.
+
+        """
         return column_summary.get_summary()
 
     def _merge_all(self):
+        """ Create a ColumnNameSummary containing the overall dataset summary.
+
+        Returns:
+            ColumnNameSummary - the overall summary object for column names.
+
+        """
         all_sum = ColumnNameSummary(name='Dataset')
         for key, counts in self.summary_dict.items():
             for name, pos in counts.file_dict.items():
                 all_sum.update(name, counts.unique_headers[pos])
         return all_sum
 
     def _get_result_string(self, name, result, indent=BaseContext.DISPLAY_INDENT):
+        """ Return a formatted string with the summary for the indicated name.
+
+        Parameters:
+            name (str):  Identifier (usually the filename) of the individual file.
+            result (dict): The dictionary of the summary results indexed by name.
+            indent (str): A string containing spaces used for indentation (usually 3 spaces).
+
+        Returns:
+            str - The results in a printable format ready to be saved to a text file.
+
+        Notes:
+            This calls _get_dataset_string to get the overall summary string.
+
+        """
         if name == "Dataset":
             return self._get_dataset_string(result, indent)
         columns = result["Columns"][0]
         return f"{indent}{str(columns['Column names'])}"
 
     @staticmethod
     def _get_dataset_string(result, indent=BaseContext.DISPLAY_INDENT):
+        """ Return a string with the overall summary for all of the tabular files.
+
+        Parameters:
+            result (dict): Dictionary of merged summary information.
+            indent (str):  String of blanks used as the amount to indent for readability.
+
+        Returns:
+            str: Formatted string suitable for saving in a file or printing.
+
+        """
         sum_list = [f"Dataset: Number of files={result.get('Number files', 0)}"]
         for element in result.get("Columns", []):
             sum_list.append(f"{indent}Columns: {str(element['Column names'])}")

diff --git a/hed/tools/remodeling/operations/summarize_column_values_op.py b/hed/tools/remodeling/operations/summarize_column_values_op.py
@@ -88,28 +88,79 @@ def __init__(self, sum_op):
         self.skip_columns = sum_op.skip_columns
 
     def update_context(self, new_context):
+        """ Update the summary for a given tabular input file.
+
+        Parameters:
+            new_context (dict):  A dictionary with the parameters needed to update a summary.
+
+        Notes:
+            - The summary information is kept in separate TabularSummary objects for each file.  
+            - The summary needs a "name" str and a "df".
+
+        """
         name = new_context['name']
         if name not in self.summary_dict:
             self.summary_dict[name] = \
                 TabularSummary(value_cols=self.value_columns, skip_cols=self.skip_columns, name=name)
         self.summary_dict[name].update(new_context['df'])
 
-    def _get_summary_details(self, summary):
+    def _get_details_dict(self, summary):
+        """ Return a dictionary with the summary contained in a TabularSummary
+
+        Parameters:
+            summary (TabularSummary): Summary object for an individual file or for the merged dataset.
+
+        Returns:
+            dict: Dictionary with the information suitable for extracting printout.
+
+        """
         return summary.get_summary(as_json=False)
 
     def _merge_all(self):
+        """ Create a TabularSummary containing the overall dataset summary.
+
+        Returns:
+            TabularSummary - the summary object for column values.
+
+        """
         all_sum = TabularSummary(value_cols=self.value_columns, skip_cols=self.skip_columns, name='Dataset')
         for key, counts in self.summary_dict.items():
             all_sum.update_summary(counts)
         return all_sum
 
     def _get_result_string(self, name, result, indent=BaseContext.DISPLAY_INDENT):
+        """ Return a formatted string with the summary for the indicated name.
+
+        Parameters:
+            name (str):  Identifier (usually the filename) of the individual file.
+            result (dict): The dictionary of the summary results indexed by name.
+            indent (str): A string containing spaces used for indentation (usually 3 spaces).
+
+        Returns:
+            str - The results in a printable format ready to be saved to a text file.
+
+        Notes:
+            This calls _get_dataset_string to get the overall summary string and
+            _get_individual_string to get an individual summary string.
+
+        """
+
         if name == "Dataset":
             return self._get_dataset_string(result, indent=indent)
-        return self._get_individual_string(name, result, indent=indent)
+        return self._get_individual_string(result, indent=indent)
 
     @staticmethod
     def _get_dataset_string(result, indent=BaseContext.DISPLAY_INDENT):
+        """ Return a string with the overall summary for all of the tabular files.
+
+        Parameters:
+            result (dict): Dictionary of merged summary information.
+            indent (str):  String of blanks used as the amount to indent for readability.
+
+        Returns:
+            str: Formatted string suitable for saving in a file or printing.
+
+        """
         sum_list = [f"Dataset: Total events={result.get('Total events', 0)} "
                     f"Total files={result.get('Total files', 0)}"]
         cat_cols = result.get("Categorical columns", {})
@@ -121,7 +172,17 @@ def _get_dataset_string(result, indent=BaseContext.DISPLAY_INDENT):
         return "\n".join(sum_list)
 
     @staticmethod
-    def _get_individual_string(name, result, indent=BaseContext.DISPLAY_INDENT):
+    def _get_individual_string(result, indent=BaseContext.DISPLAY_INDENT):
+        """ Return a string with the summary for an individual tabular file.
+
+        Parameters:
+            result (dict): Dictionary of summary information for a particular tabular file.
+            indent (str):  String of blanks used as the amount to indent for readability.
+
+        Returns:
+            str: Formatted string suitable for saving in a file or printing.
+
+        """
         sum_list = [f"Total events={result.get('Total events', 0)}"]
         cat_cols = result.get("Categorical columns", {})
         if cat_cols: