Merge pull request #565 from VisLab/develop
Develop
VisLab committed Dec 20, 2022
2 parents 74ea684 + b67ae4d commit 9c428ea
Showing 45 changed files with 1,045 additions and 239 deletions.
2 changes: 1 addition & 1 deletion hed/tools/analysis/column_name_summary.py
@@ -30,7 +30,7 @@ def get_summary(self, as_json=False):
         column_headers = []
         for index in range(len(patterns)):
             column_headers.append({'Column names': self.unique_headers[index], 'Files': patterns[index]})
-        summary = {"Summary name": self.name, "Columns": column_headers}
+        summary = {"Summary name": self.name, "Columns": column_headers, "Number files": len(self.file_dict)}
         if as_json:
             return json.dumps(summary, indent=4)
         else:
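
The net effect is one new key in the returned summary dictionary. A hypothetical illustration (the class name is inferred from the file name and the constructor call is an assumption):

    # Sketch only: ColumnNameSummary and its constructor arguments are assumptions.
    summary = ColumnNameSummary(name="my_columns")
    # ... after the summary has been updated with several files ...
    info = summary.get_summary(as_json=False)
    # info == {"Summary name": "my_columns",
    #          "Columns": [{"Column names": [...], "Files": [...]}, ...],
    #          "Number files": len(summary.file_dict)}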
7 changes: 5 additions & 2 deletions hed/tools/analysis/hed_type_counts.py
@@ -100,6 +100,7 @@ def update_summary(self, type_sum, total_events=0, file_id=None):
         Parameters:
             type_sum (dict): Contains the information about the value of a type.
             total_events (int): Total number of events processed.
+            file_id (str): Unique identifier for the associated file.

         """

@@ -109,7 +110,7 @@ def update_summary(self, type_sum, total_events=0, file_id=None):
             val_counts = self.type_dict[type_val]
             val_counts.update(type_counts, file_id)
         self.files[file_id] = ''
-        self.total_events = self.total_events + 1
+        self.total_events = self.total_events + total_events

     def add_descriptions(self, type_defs):
         """ Update this summary based on the type variable map.
@@ -130,6 +131,7 @@ def add_descriptions(self, type_defs):
                     type_count.level_counts[level]['description'] = level_dict['description']

     def update(self, counts):
+        self.total_events = self.total_events + counts.total_events
         for key, count in counts.type_dict.items():
             if key not in self.type_dict:
                 self.type_dict[key] = HedTypeCount(count.type_value, count.type_tag, None)
@@ -142,4 +144,5 @@ def get_summary(self):
         details = {}
         for type_value, count in self.type_dict.items():
             details[type_value] = count.get_summary()
-        return {'name': str(self.name), 'type_tag': self.type_tag, 'files': list(self.files.keys()), 'details': details}
+        return {'name': str(self.name), 'type_tag': self.type_tag, 'files': list(self.files.keys()),
+                'total_events': self.total_events, 'details': details}
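
A minimal sketch of the new accumulation behavior, assuming counts_run1 and counts_run2 are per-run count objects of this class and combined is an empty one (all three names are assumptions):

    combined.update(counts_run1)   # now also folds counts_run1.total_events into combined.total_events
    combined.update(counts_run2)
    info = combined.get_summary()
    # info now carries 'total_events' alongside 'name', 'type_tag', 'files', and 'details'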
30 changes: 27 additions & 3 deletions hed/tools/analysis/key_map.py
@@ -1,6 +1,6 @@
 import pandas as pd
 from hed.errors.exceptions import HedFileError
-from hed.tools.util.data_util import get_new_dataframe, get_row_hash, remove_quotes, separate_values
+from hed.tools.util.data_util import get_new_dataframe, get_row_hash, separate_values


 class KeyMap:
@@ -11,6 +11,8 @@ class KeyMap:
         key_cols (list): A list of column names that will be hashed into the keys for the map.
         target_cols (list): An optional list of column names that will be inserted into data and later remapped.

+    Notes: This mapping converts all columns of type object to string.
+
     """
     def __init__(self, key_cols, target_cols=None, name=''):
         """ Information for remapping columns of tabular files.
@@ -77,7 +79,7 @@ def remap(self, data):
         """ Remap the columns of a dataframe or columnar file.

         Parameters:
-            data (DataFrame, str) : Columnar data (either DataFrame or filename) whose columns are to be remapped.
+            data (DataFrame, str): Columnar data (either DataFrame or filename) whose columns are to be remapped.

         Returns:
             tuple:
@@ -90,10 +92,10 @@ def remap(self, data):
         """

         df_new = get_new_dataframe(data)
-        remove_quotes(df_new)
         present_keys, missing_keys = separate_values(df_new.columns.values.tolist(), self.key_cols)
         if missing_keys:
             raise HedFileError("MissingKeys", f"File must have key columns {str(self.key_cols)}", "")
+        self.remove_quotes(df_new, columns=present_keys)
         df_new[self.target_cols] = 'n/a'
         missing_indices = self._remap(df_new)
         return df_new, missing_indices
@@ -149,7 +151,9 @@ def update(self, data, allow_missing=True, keep_counts=True):
         if keys_missing and not allow_missing:
             raise HedFileError("MissingKeyColumn",
                                f"make_template data does not have key columns {str(keys_missing)}", "")
+
         base_df = df[keys_present].copy()
+        self.remove_quotes(base_df)
         if keys_missing:
             base_df[keys_missing] = 'n/a'
         if self.target_cols:
@@ -205,3 +209,23 @@ def _handle_update(self, row, row_list, next_pos, keep_counts):
             if keep_counts:
                 self.count_dict[key] += 1
         return key, pos_update
+
+    @staticmethod
+    def remove_quotes(df, columns=None):
+        """ Remove quotes from the specified columns and convert to string.
+
+        Parameters:
+            df (Dataframe): Dataframe to process by removing quotes.
+            columns (list): List of column names. If None, all columns are used.
+        Notes:
+            - Replacement is done in place.
+        """
+
+        col_types = df.dtypes
+        if not columns:
+            columns = df.columns.values.tolist()
+        for index, col in enumerate(df.columns):
+            if col in columns and col_types.iloc[index] in ['string', 'object']:
+                df[col] = df[col].astype(str)
+                df.iloc[:, index] = df.iloc[:, index].str.replace('"', '')
+                df.iloc[:, index] = df.iloc[:, index].str.replace("'", "")
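
A minimal sketch (not part of the commit) showing the new static helper on a small DataFrame with quoted values; the example data is an assumption:

    import pandas as pd
    from hed.tools.analysis.key_map import KeyMap

    df = pd.DataFrame({"event_type": ['"go"', "'stop'"], "latency": ["0.5", "0.7"]})
    KeyMap.remove_quotes(df, columns=["event_type"])   # strips quotes in place
    print(df["event_type"].tolist())                   # ['go', 'stop']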
37 changes: 26 additions & 11 deletions hed/tools/analysis/tabular_summary.py
@@ -25,13 +25,14 @@ def __init__(self, value_cols=None, skip_cols=None, name=''):
                                f"Value columns {str(value_cols)} and skip columns {str(skip_cols)} cannot overlap", "")
         if value_cols:
             for value in value_cols:
-                self.value_info[value] = 0
+                self.value_info[value] = [0, 0]
         if skip_cols:
             self.skip_cols = skip_cols.copy()
         else:
             self.skip_cols = []
         self.total_files = 0
         self.total_events = 0
+        self.files = {}

     def __str__(self):
         indent = " "
@@ -72,12 +73,13 @@ def get_summary(self, as_json=False):
             val_dict = {}
             for v_key in sorted_v_keys:
                 val_dict[v_key] = cat_dict[v_key]
-            categorical_cols[f"{key} [categorical column] values"] = val_dict
+            categorical_cols[key] = val_dict
         sorted_cols = sorted(map(str, list(self.value_info)))
         value_cols = {}
         for key in sorted_cols:
-            value_cols[f"{key} [value_column]"] = f"{self.value_info[key]} values"
-        summary = {"Summary name": self.name, "Categorical columns": categorical_cols, "Value columns": value_cols}
+            value_cols[key] = self.value_info[key]
+        summary = {"Summary name": self.name, "Total events": self.total_events, "Total files": self.total_files,
+                   "Categorical columns": categorical_cols, "Value columns": value_cols}
         if as_json:
             return json.dumps(summary, indent=4)
         else:
@@ -103,7 +105,7 @@ def get_number_unique(self, column_names=None):
             counts[column_name] = len(self.categorical_info[column_name].keys())
         return counts

-    def update(self, data):
+    def update(self, data, name=None):
         """ Update the counts based on data.

         Parameters:
@@ -113,9 +115,11 @@ def update(self, data, name=None):

         if isinstance(data, list):
             for filename in data:
-                self._update_dataframe(filename)
+                self._update_dataframe(filename, filename)
+        elif isinstance(data, str):
+            self._update_dataframe(data, data)
         else:
-            self._update_dataframe(data)
+            self._update_dataframe(data, name)

     def update_summary(self, tab_sum):
         """ Add TabularSummary values to this object.
@@ -128,6 +132,10 @@ def update_summary(self, tab_sum):
             - A new skip column cannot used.
         """
         self.total_files = self.total_files + tab_sum.total_files
+        self.total_events = self.total_events + tab_sum.total_events
+        for file, key in tab_sum.files.items():
+            self.files[file] = ''
+
         self._update_dict_skip(tab_sum)
         self._update_dict_value(tab_sum)
         self._update_dict_categorical(tab_sum)
@@ -138,17 +146,23 @@ def _update_categorical(self, tab_name, values):

         total_values = self.categorical_info[tab_name]
         for name, value in values.items():
-            total_values[name] = total_values.get(name, 0) + value
+            value_list = total_values.get(name, [0, 0])
+            if not isinstance(value, list):
+                value = [value, 1]
+            total_values[name] = [value_list[0] + value[0], value_list[1] + value[1]]

-    def _update_dataframe(self, data):
+    def _update_dataframe(self, data, name):
         df = get_new_dataframe(data)
+        if name:
+            self.files[name] = ""
         self.total_files = self.total_files + 1
         self.total_events = self.total_events + len(df.index)
         for col_name, col_values in df.items():
             if self.skip_cols and col_name in self.skip_cols:
                 continue
             if col_name in self.value_info.keys():
-                self.value_info[col_name] = self.value_info[col_name] + len(col_values)
+                self.value_info[col_name][0] = self.value_info[col_name][0] + len(col_values)
+                self.value_info[col_name][1] = self.value_info[col_name][1] + 1
             else:
                 col_values = col_values.astype(str)
                 values = col_values.value_counts(ascending=True)
@@ -194,7 +208,8 @@ def _update_dict_value(self, col_dict):
             elif col not in val_cols:
                 self.value_info[col] = col_dict.value_info[col]
             else:
-                self.value_info[col] = self.value_info[col] + col_dict.value_info[col]
+                self.value_info[col] = [self.value_info[col][0] + col_dict.value_info[col][0],
+                                        self.value_info[col][1] + col_dict.value_info[col][1]]

     @staticmethod
     def get_columns_info(dataframe, skip_cols=None):
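
A usage sketch with hypothetical file names: after this change each value column accumulates a [number of values, number of files] pair, and the summary reports file and event totals:

    from hed.tools.analysis.tabular_summary import TabularSummary

    tab_sum = TabularSummary(value_cols=["response_time"], skip_cols=["onset"])
    tab_sum.update("sub-01_events.tsv")   # a str argument is now registered under its own name
    tab_sum.update("sub-02_events.tsv")
    info = tab_sum.get_summary(as_json=False)
    # info["Total files"] == 2 and
    # info["Value columns"]["response_time"] == [total number of values, 2]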
14 changes: 7 additions & 7 deletions hed/tools/remodeling/cli/run_remodel.py
@@ -12,27 +12,27 @@ def get_parser():
     parser = argparse.ArgumentParser(description="Converts event files based on a json file specifying operations.")
     parser.add_argument("data_dir", help="Full path of dataset root directory.")
     parser.add_argument("remodel_path", help="Full path of the file with remodeling instructions.")
-    parser.add_argument("-t", "--task-names", dest="task_names", nargs="*", default=[], help="The name of the task.")
+    parser.add_argument("-b", "--bids-format", action='store_true', dest="use_bids",
+                        help="If present, the dataset is in BIDS format with sidecars. HED analysis is available.")
     parser.add_argument("-e", "--extensions", nargs="*", default=['.tsv'], dest="extensions",
                         help="File extensions to allow in locating files.")
-    parser.add_argument("-x", "--exclude-dirs", nargs="*", default=['derivatives'], dest="exclude_dirs",
-                        help="Directories names to exclude from search for files.")
     parser.add_argument("-f", "--file-suffix", dest="file_suffix", default='events',
                         help="Filename suffix excluding file type of items to be analyzed (events by default).")
     parser.add_argument("-i", "--include-individual", action='store_true', dest="include_individual",
                         help="If present, individual files are summarized in addition to overall summary.")
     parser.add_argument("-j", "--json-sidecar", dest="json_sidecar", nargs="?",
                         help="Optional path to JSON sidecar with HED information")
+    parser.add_argument("-n", "--backup-name", default=BackupManager.DEFAULT_BACKUP_NAME, dest="backup_name",
+                        help="Name of the default backup for remodeling")
     parser.add_argument("-r", "--hed-versions", dest="hed_versions", nargs="*", default=[],
                         help="Optional list of HED schema versions used for annotation, include prefixes.")
     parser.add_argument("-s", "--save-formats", nargs="*", default=['.json', '.txt'], dest="save_formats",
                         help="Format for saving any summaries, if any. If empty, then no summaries are saved.")
-    parser.add_argument("-b", "--bids-format", action='store_true', dest="use_bids",
-                        help="If present, the dataset is in BIDS format with sidecars. HED analysis is available.")
-    parser.add_argument("-n", "--backup-name", default=BackupManager.DEFAULT_BACKUP_NAME, dest="backup_name",
-                        help="Name of the default backup for remodeling")
+    parser.add_argument("-t", "--task-names", dest="task_names", nargs="*", default=[], help="The name of the task.")
     parser.add_argument("-v", "--verbose", action='store_true',
                         help="If present, output informative messages as computation progresses.")
+    parser.add_argument("-x", "--exclude-dirs", nargs="*", default=[], dest="exclude_dirs",
+                        help="Directories names to exclude from search for files.")
     return parser
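
A hypothetical invocation (the paths are placeholders and module-style execution is an assumption). Note that -x no longer defaults to derivatives in this script, so it is passed explicitly here:

    python -m hed.tools.remodeling.cli.run_remodel /data/ds_root /data/remodel_ops.json \
           -b -t stopsignal -x derivatives -s .json .txt -v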
12 changes: 6 additions & 6 deletions hed/tools/remodeling/cli/run_remodel_backup.py
@@ -7,19 +7,19 @@
 def get_parser():
     parser = argparse.ArgumentParser(description="Creates a backup for the remodeling process.")
     parser.add_argument("data_dir", help="Full path of dataset root directory.")
+    parser.add_argument("-e", "--extensions", nargs="*", default=['.tsv'], dest="extensions",
+                        help="File extensions to allow in locating files. A * indicates all files allowed.")
+    parser.add_argument("-f", "--file-suffix", dest="file_suffix", nargs="*", default=['events'],
+                        help="Filename suffix of files to be backed up. A * indicates all files allowed.")
     parser.add_argument("-n", "--backup_name", default=BackupManager.DEFAULT_BACKUP_NAME, dest="backup_name",
                         help="Name of the default backup for remodeling")
     parser.add_argument("-t", "--task-names", dest="task_names", nargs="*", default=[], help="The name of the task.")
+    parser.add_argument("-v", "--verbose", action='store_true',
+                        help="If present, output informative messages as computation progresses.")
     parser.add_argument("-x", "--exclude-dirs", nargs="*", default=['derivatives'], dest="exclude_dirs",
                         help="Directories names to exclude from search for files. " +
                              "If omitted, no directories except the backup directory will be excluded." +
                              "Note data_dir/remodel/backup will always be excluded.")
-    parser.add_argument("-f", "--file-suffix", dest="file_suffix", nargs="*", default=['events'],
-                        help="Filename suffix of files to be backed up. A * indicates all files allowed.")
-    parser.add_argument("-e", "--extensions", nargs="*", default=['.tsv'], dest="extensions",
-                        help="File extensions to allow in locating files. A * indicates all files allowed.")
-    parser.add_argument("-v", "--verbose", action='store_true',
-                        help="If present, output informative messages as computation progresses.")
     return parser
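
A hypothetical invocation of the backup script (placeholder paths, module-style execution assumed):

    python -m hed.tools.remodeling.cli.run_remodel_backup /data/ds_root \
           -t stopsignal -f events -e .tsv -v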
8 changes: 4 additions & 4 deletions hed/tools/remodeling/dispatcher.py
@@ -110,10 +110,10 @@ def run_operations(self, file_path, sidecar=None, verbose=False):
         if verbose:
             print(f"Reading {file_path}...")
         df = self.get_data_file(file_path)
-        df = self.prep_events(df)
+        df = self.prep_data(df)
         for operation in self.parsed_ops:
             df = operation.do_op(self, df, file_path, sidecar=sidecar)
-        return self.post_prep_events(df)
+        return self.post_proc_data(df)

     def save_context(self, save_formats=['.json', '.txt'], include_individual=True):
         """ Save the summary files in the specified formats.
@@ -161,7 +161,7 @@ def parse_operations(operation_list):
         return operations, []

     @staticmethod
-    def prep_events(df):
+    def prep_data(df):
         """ Replace all n/a entries in the data frame by np.NaN for processing.

         Parameters:
@@ -171,7 +171,7 @@ def prep_data(df):
         return df.replace('n/a', np.NaN)

     @staticmethod
-    def post_prep_events(df):
+    def post_proc_data(df):
         """ Replace all nan entries with 'n/a' for BIDS compliance

         Parameters:
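
A sketch of the renamed helpers' round trip; both are static methods per this diff, the class name Dispatcher matches the module, and the DataFrame content is assumed:

    import pandas as pd
    from hed.tools.remodeling.dispatcher import Dispatcher

    df = pd.DataFrame({"response": ["left", "n/a", "right"]})
    df = Dispatcher.prep_data(df)        # 'n/a' -> np.NaN before operations run
    # ... remodeling operations would run on df here ...
    df = Dispatcher.post_proc_data(df)   # NaN -> 'n/a' again for BIDS compliance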
6 changes: 4 additions & 2 deletions hed/tools/remodeling/operations/base_context.py
@@ -3,6 +3,8 @@
 import json
 from hed.tools.util.io_util import generate_filename

+DISPLAY_INDENT = " "
+

 class BaseContext(ABC):
     """ Abstract base class for summary contexts. Should not be instantiated.
@@ -71,13 +73,13 @@ def get_text_summary(self, title='', include_individual=True):
             sum_list = []
             for name, individual_result in result["Individual files"].items():
                 sum_list.append(self._get_result_string(name, individual_result))
-            summary_details = summary_details + "\n" + "\n".join(sum_list)
+            summary_details = summary_details + "\n\nIndividual files:\n\n" + "\n\n".join(sum_list)
         if title:
             title_str = title + "\n"
         else:
             title_str = ''
         sum_str = f"{title_str}Context name: {self.context_name}\n" + f"Context type: {self.context_type}\n" + \
-                  f"Context filename: {self.context_filename}\n" + f"\nSummary details:\n{summary_details}"
+                  f"Context filename: {self.context_filename}\n" + f"\nSummary details:\n\n{summary_details}"
         return sum_str

     def save(self, save_dir, file_formats=['.txt'], include_individual=True):
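
With these changes a text summary lays out roughly as follows (the values shown are hypothetical):

    Context name: my_summary
    Context type: column_names
    Context filename: my_summary_file

    Summary details:

    <overall summary text>

    Individual files:

    <one block per file, separated by blank lines>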
Some files were not shown because too many files changed in this diff.