Develop #565

Merged · 2 commits · Dec 20, 2022

Changes from all commits
2 changes: 1 addition & 1 deletion hed/tools/analysis/column_name_summary.py
@@ -30,7 +30,7 @@ def get_summary(self, as_json=False):
         column_headers = []
         for index in range(len(patterns)):
             column_headers.append({'Column names': self.unique_headers[index], 'Files': patterns[index]})
-        summary = {"Summary name": self.name, "Columns": column_headers}
+        summary = {"Summary name": self.name, "Columns": column_headers, "Number files": len(self.file_dict)}
         if as_json:
             return json.dumps(summary, indent=4)
         else:
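A minimal sketch of the summary shape after this change, assuming file_dict is keyed by filename as the class's update path implies (all values here are invented):

import json

# Stand-ins mirroring ColumnNameSummary's fields (hypothetical values).
name = "columns_summary"
unique_headers = [["onset", "duration", "trial_type"]]
patterns = [["sub-01_events.tsv", "sub-02_events.tsv"]]
file_dict = {"sub-01_events.tsv": 0, "sub-02_events.tsv": 1}

column_headers = []
for index in range(len(patterns)):
    column_headers.append({'Column names': unique_headers[index], 'Files': patterns[index]})
summary = {"Summary name": name, "Columns": column_headers, "Number files": len(file_dict)}
print(json.dumps(summary, indent=4))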
7 changes: 5 additions & 2 deletions hed/tools/analysis/hed_type_counts.py
@@ -100,6 +100,7 @@ def update_summary(self, type_sum, total_events=0, file_id=None):
 
         Parameters:
             type_sum (dict): Contains the information about the value of a type.
+            total_events (int): Total number of events processed.
             file_id (str): Unique identifier for the associated file.
         """
 
@@ -109,7 +110,7 @@ def update_summary(self, type_sum, total_events=0, file_id=None):
             val_counts = self.type_dict[type_val]
             val_counts.update(type_counts, file_id)
         self.files[file_id] = ''
-        self.total_events = self.total_events + 1
+        self.total_events = self.total_events + total_events
 
     def add_descriptions(self, type_defs):
         """ Update this summary based on the type variable map.
@@ -130,6 +131,7 @@ def add_descriptions(self, type_defs):
                 type_count.level_counts[level]['description'] = level_dict['description']
 
     def update(self, counts):
+        self.total_events = self.total_events + counts.total_events
         for key, count in counts.type_dict.items():
             if key not in self.type_dict:
                 self.type_dict[key] = HedTypeCount(count.type_value, count.type_tag, None)
@@ -142,4 +144,5 @@ def get_summary(self):
         details = {}
         for type_value, count in self.type_dict.items():
             details[type_value] = count.get_summary()
-        return {'name': str(self.name), 'type_tag': self.type_tag, 'files': list(self.files.keys()), 'details': details}
+        return {'name': str(self.name), 'type_tag': self.type_tag, 'files': list(self.files.keys()),
+                'total_events': self.total_events, 'details': details}
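The net effect: total_events now accumulates real event counts and survives merges. A toy sketch (class and numbers invented) of the fixed behavior:

# Hypothetical accumulator mirroring the fixed logic above.
class EventTotal:
    def __init__(self):
        self.total_events = 0

    def update_summary(self, total_events=0):
        # The old code added 1 per call; the fix adds the file's event count.
        self.total_events = self.total_events + total_events

    def update(self, other):
        # Merging two summaries now also merges their event totals.
        self.total_events = self.total_events + other.total_events

run1, run2 = EventTotal(), EventTotal()
run1.update_summary(total_events=200)   # 200 events in file 1
run2.update_summary(total_events=150)   # 150 events in file 2
run1.update(run2)
print(run1.total_events)  # 350; the old "+ 1" logic counted calls, not events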
30 changes: 27 additions & 3 deletions hed/tools/analysis/key_map.py
@@ -1,6 +1,6 @@
 import pandas as pd
 from hed.errors.exceptions import HedFileError
-from hed.tools.util.data_util import get_new_dataframe, get_row_hash, remove_quotes, separate_values
+from hed.tools.util.data_util import get_new_dataframe, get_row_hash, separate_values
 
 
 class KeyMap:
@@ -11,6 +11,8 @@ class KeyMap:
         key_cols (list): A list of column names that will be hashed into the keys for the map.
         target_cols (list): An optional list of column names that will be inserted into data and later remapped.
 
+    Notes: This mapping converts all columns of type object to string.
+
     """
     def __init__(self, key_cols, target_cols=None, name=''):
         """ Information for remapping columns of tabular files.
@@ -77,7 +79,7 @@ def remap(self, data):
         """ Remap the columns of a dataframe or columnar file.
 
         Parameters:
-            data (DataFrame, str) : Columnar data (either DataFrame or filename) whose columns are to be remapped.
+            data (DataFrame, str): Columnar data (either DataFrame or filename) whose columns are to be remapped.
 
         Returns:
             tuple:
@@ -90,10 +92,10 @@ def remap(self, data):
         """
 
         df_new = get_new_dataframe(data)
-        remove_quotes(df_new)
         present_keys, missing_keys = separate_values(df_new.columns.values.tolist(), self.key_cols)
         if missing_keys:
             raise HedFileError("MissingKeys", f"File must have key columns {str(self.key_cols)}", "")
+        self.remove_quotes(df_new, columns=present_keys)
         df_new[self.target_cols] = 'n/a'
         missing_indices = self._remap(df_new)
         return df_new, missing_indices
@@ -149,7 +151,9 @@ def update(self, data, allow_missing=True, keep_counts=True):
         if keys_missing and not allow_missing:
             raise HedFileError("MissingKeyColumn",
                                f"make_template data does not have key columns {str(keys_missing)}", "")
+
         base_df = df[keys_present].copy()
+        self.remove_quotes(base_df)
         if keys_missing:
             base_df[keys_missing] = 'n/a'
         if self.target_cols:
@@ -205,3 +209,23 @@ def _handle_update(self, row, row_list, next_pos, keep_counts):
         if keep_counts:
             self.count_dict[key] += 1
         return key, pos_update
+
+    @staticmethod
+    def remove_quotes(df, columns=None):
+        """ Remove quotes from the specified columns and convert to string.
+
+        Parameters:
+            df (Dataframe): Dataframe to process by removing quotes.
+            columns (list): List of column names. If None, all columns are used.
+        Notes:
+            - Replacement is done in place.
+        """
+
+        col_types = df.dtypes
+        if not columns:
+            columns = df.columns.values.tolist()
+        for index, col in enumerate(df.columns):
+            if col in columns and col_types.iloc[index] in ['string', 'object']:
+                df[col] = df[col].astype(str)
+                df.iloc[:, index] = df.iloc[:, index].str.replace('"', '')
+                df.iloc[:, index] = df.iloc[:, index].str.replace("'", "")
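A standalone sketch of the new remove_quotes helper in action (the function body is copied from the diff above; the DataFrame is invented):

import pandas as pd

def remove_quotes(df, columns=None):
    # Same logic as KeyMap.remove_quotes above, reproduced for a runnable demo.
    col_types = df.dtypes
    if not columns:
        columns = df.columns.values.tolist()
    for index, col in enumerate(df.columns):
        if col in columns and col_types.iloc[index] in ['string', 'object']:
            df[col] = df[col].astype(str)
            df.iloc[:, index] = df.iloc[:, index].str.replace('"', '')
            df.iloc[:, index] = df.iloc[:, index].str.replace("'", "")

df = pd.DataFrame({'trial_type': ['"go"', "'stop'"], 'onset': [0.5, 1.5]})
remove_quotes(df, columns=['trial_type'])
print(df['trial_type'].tolist())  # ['go', 'stop']; non-object 'onset' is untouched

Note that remap now strips quotes only from the key columns actually present, after the missing-key check, rather than from the whole frame up front.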
37 changes: 26 additions & 11 deletions hed/tools/analysis/tabular_summary.py
@@ -25,13 +25,14 @@ def __init__(self, value_cols=None, skip_cols=None, name=''):
                                f"Value columns {str(value_cols)} and skip columns {str(skip_cols)} cannot overlap", "")
         if value_cols:
             for value in value_cols:
-                self.value_info[value] = 0
+                self.value_info[value] = [0, 0]
         if skip_cols:
             self.skip_cols = skip_cols.copy()
         else:
             self.skip_cols = []
         self.total_files = 0
         self.total_events = 0
+        self.files = {}
 
     def __str__(self):
         indent = "   "
@@ -72,12 +73,13 @@ def get_summary(self, as_json=False):
             val_dict = {}
             for v_key in sorted_v_keys:
                 val_dict[v_key] = cat_dict[v_key]
-            categorical_cols[f"{key} [categorical column] values"] = val_dict
+            categorical_cols[key] = val_dict
         sorted_cols = sorted(map(str, list(self.value_info)))
         value_cols = {}
         for key in sorted_cols:
-            value_cols[f"{key} [value_column]"] = f"{self.value_info[key]} values"
-        summary = {"Summary name": self.name, "Categorical columns": categorical_cols, "Value columns": value_cols}
+            value_cols[key] = self.value_info[key]
+        summary = {"Summary name": self.name, "Total events": self.total_events, "Total files": self.total_files,
+                   "Categorical columns": categorical_cols, "Value columns": value_cols}
         if as_json:
             return json.dumps(summary, indent=4)
         else:
@@ -103,7 +105,7 @@ def get_number_unique(self, column_names=None):
             counts[column_name] = len(self.categorical_info[column_name].keys())
         return counts
 
-    def update(self, data):
+    def update(self, data, name=None):
         """ Update the counts based on data.
 
         Parameters:
@@ -113,9 +115,11 @@ def update(self, data):
 
         if isinstance(data, list):
             for filename in data:
-                self._update_dataframe(filename)
+                self._update_dataframe(filename, filename)
+        elif isinstance(data, str):
+            self._update_dataframe(data, data)
         else:
-            self._update_dataframe(data)
+            self._update_dataframe(data, name)
 
     def update_summary(self, tab_sum):
         """ Add TabularSummary values to this object.
@@ -128,6 +132,10 @@ def update_summary(self, tab_sum):
             - A new skip column cannot used.
 
         """
+        self.total_files = self.total_files + tab_sum.total_files
+        self.total_events = self.total_events + tab_sum.total_events
+        for file, key in tab_sum.files.items():
+            self.files[file] = ''
         self._update_dict_skip(tab_sum)
         self._update_dict_value(tab_sum)
         self._update_dict_categorical(tab_sum)
@@ -138,17 +146,23 @@ def _update_categorical(self, tab_name, values):
 
         total_values = self.categorical_info[tab_name]
         for name, value in values.items():
-            total_values[name] = total_values.get(name, 0) + value
+            value_list = total_values.get(name, [0, 0])
+            if not isinstance(value, list):
+                value = [value, 1]
+            total_values[name] = [value_list[0] + value[0], value_list[1] + value[1]]
 
-    def _update_dataframe(self, data):
+    def _update_dataframe(self, data, name):
         df = get_new_dataframe(data)
+        if name:
+            self.files[name] = ""
+        self.total_files = self.total_files + 1
+        self.total_events = self.total_events + len(df.index)
         for col_name, col_values in df.items():
             if self.skip_cols and col_name in self.skip_cols:
                 continue
             if col_name in self.value_info.keys():
-                self.value_info[col_name] = self.value_info[col_name] + len(col_values)
+                self.value_info[col_name][0] = self.value_info[col_name][0] + len(col_values)
+                self.value_info[col_name][1] = self.value_info[col_name][1] + 1
             else:
                 col_values = col_values.astype(str)
                 values = col_values.value_counts(ascending=True)
@@ -194,7 +208,8 @@ def _update_dict_value(self, col_dict):
             elif col not in val_cols:
                 self.value_info[col] = col_dict.value_info[col]
             else:
-                self.value_info[col] = self.value_info[col] + col_dict.value_info[col]
+                self.value_info[col] = [self.value_info[col][0] + col_dict.value_info[col][0],
+                                        self.value_info[col][1] + col_dict.value_info[col][1]]
 
     @staticmethod
     def get_columns_info(dataframe, skip_cols=None):
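The upshot is that each value-column entry is now a [num_values, num_files] pair rather than a bare count, so merged summaries keep per-file coverage. A toy sketch (column name and counts invented):

# Hypothetical bookkeeping mirroring the [values, files] pairs above.
value_info = {'response_time': [0, 0]}

def update_from_file(num_rows):
    value_info['response_time'][0] += num_rows  # total values seen
    value_info['response_time'][1] += 1         # number of files contributing

update_from_file(200)
update_from_file(150)
print(value_info)  # {'response_time': [350, 2]}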
14 changes: 7 additions & 7 deletions hed/tools/remodeling/cli/run_remodel.py
@@ -12,27 +12,27 @@ def get_parser():
     parser = argparse.ArgumentParser(description="Converts event files based on a json file specifying operations.")
     parser.add_argument("data_dir", help="Full path of dataset root directory.")
     parser.add_argument("remodel_path", help="Full path of the file with remodeling instructions.")
-    parser.add_argument("-t", "--task-names", dest="task_names", nargs="*", default=[], help="The name of the task.")
+    parser.add_argument("-b", "--bids-format", action='store_true', dest="use_bids",
+                        help="If present, the dataset is in BIDS format with sidecars. HED analysis is available.")
     parser.add_argument("-e", "--extensions", nargs="*", default=['.tsv'], dest="extensions",
                         help="File extensions to allow in locating files.")
-    parser.add_argument("-x", "--exclude-dirs", nargs="*", default=['derivatives'], dest="exclude_dirs",
-                        help="Directories names to exclude from search for files.")
     parser.add_argument("-f", "--file-suffix", dest="file_suffix", default='events',
                         help="Filename suffix excluding file type of items to be analyzed (events by default).")
     parser.add_argument("-i", "--include-individual", action='store_true', dest="include_individual",
                         help="If present, individual files are summarized in addition to overall summary.")
     parser.add_argument("-j", "--json-sidecar", dest="json_sidecar", nargs="?",
                         help="Optional path to JSON sidecar with HED information")
+    parser.add_argument("-n", "--backup-name", default=BackupManager.DEFAULT_BACKUP_NAME, dest="backup_name",
+                        help="Name of the default backup for remodeling")
     parser.add_argument("-r", "--hed-versions", dest="hed_versions", nargs="*", default=[],
                         help="Optional list of HED schema versions used for annotation, include prefixes.")
     parser.add_argument("-s", "--save-formats", nargs="*", default=['.json', '.txt'], dest="save_formats",
                         help="Format for saving any summaries, if any. If empty, then no summaries are saved.")
-    parser.add_argument("-b", "--bids-format", action='store_true', dest="use_bids",
-                        help="If present, the dataset is in BIDS format with sidecars. HED analysis is available.")
-    parser.add_argument("-n", "--backup-name", default=BackupManager.DEFAULT_BACKUP_NAME, dest="backup_name",
-                        help="Name of the default backup for remodeling")
+    parser.add_argument("-t", "--task-names", dest="task_names", nargs="*", default=[], help="The name of the task.")
     parser.add_argument("-v", "--verbose", action='store_true',
                         help="If present, output informative messages as computation progresses.")
+    parser.add_argument("-x", "--exclude-dirs", nargs="*", default=[], dest="exclude_dirs",
+                        help="Directories names to exclude from search for files.")
     return parser
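Besides the alphabetical reordering of options, note the behavioral change: -x/--exclude-dirs now defaults to [] here rather than ['derivatives']. A hypothetical invocation of the parser (paths and task name invented):

from hed.tools.remodeling.cli.run_remodel import get_parser

args = get_parser().parse_args([
    "/data/ds_example",                    # data_dir (hypothetical path)
    "/data/ds_example/remodel_rmdl.json",  # remodel_path (hypothetical path)
    "-b",                                  # dataset is BIDS; HED analysis available
    "-t", "stopsignal",                    # task name (invented)
    "-x", "derivatives", "stimuli",        # exclusions must now be passed explicitly
])
print(args.use_bids, args.task_names, args.exclude_dirs)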
12 changes: 6 additions & 6 deletions hed/tools/remodeling/cli/run_remodel_backup.py
@@ -7,19 +7,19 @@
 def get_parser():
     parser = argparse.ArgumentParser(description="Creates a backup for the remodeling process.")
     parser.add_argument("data_dir", help="Full path of dataset root directory.")
+    parser.add_argument("-e", "--extensions", nargs="*", default=['.tsv'], dest="extensions",
+                        help="File extensions to allow in locating files. A * indicates all files allowed.")
+    parser.add_argument("-f", "--file-suffix", dest="file_suffix", nargs="*", default=['events'],
+                        help="Filename suffix of files to be backed up. A * indicates all files allowed.")
     parser.add_argument("-n", "--backup_name", default=BackupManager.DEFAULT_BACKUP_NAME, dest="backup_name",
                         help="Name of the default backup for remodeling")
     parser.add_argument("-t", "--task-names", dest="task_names", nargs="*", default=[], help="The name of the task.")
+    parser.add_argument("-v", "--verbose", action='store_true',
+                        help="If present, output informative messages as computation progresses.")
     parser.add_argument("-x", "--exclude-dirs", nargs="*", default=['derivatives'], dest="exclude_dirs",
                         help="Directories names to exclude from search for files. " +
                              "If omitted, no directories except the backup directory will be excluded." +
                              "Note data_dir/remodel/backup will always be excluded.")
-    parser.add_argument("-f", "--file-suffix", dest="file_suffix", nargs="*", default=['events'],
-                        help="Filename suffix of files to be backed up. A * indicates all files allowed.")
-    parser.add_argument("-e", "--extensions", nargs="*", default=['.tsv'], dest="extensions",
-                        help="File extensions to allow in locating files. A * indicates all files allowed.")
-    parser.add_argument("-v", "--verbose", action='store_true',
-                        help="If present, output informative messages as computation progresses.")
     return parser
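Here the reordering is purely cosmetic; unlike run_remodel above, -x still defaults to ['derivatives']. A quick check of the defaults (data_dir path invented):

from hed.tools.remodeling.cli.run_remodel_backup import get_parser

args = get_parser().parse_args(["/data/ds_example"])
print(args.extensions, args.file_suffix, args.exclude_dirs)
# Expected: ['.tsv'] ['events'] ['derivatives']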
8 changes: 4 additions & 4 deletions hed/tools/remodeling/dispatcher.py
@@ -110,10 +110,10 @@ def run_operations(self, file_path, sidecar=None, verbose=False):
         if verbose:
             print(f"Reading {file_path}...")
         df = self.get_data_file(file_path)
-        df = self.prep_events(df)
+        df = self.prep_data(df)
         for operation in self.parsed_ops:
             df = operation.do_op(self, df, file_path, sidecar=sidecar)
-        return self.post_prep_events(df)
+        return self.post_proc_data(df)
 
     def save_context(self, save_formats=['.json', '.txt'], include_individual=True):
         """ Save the summary files in the specified formats.
@@ -161,7 +161,7 @@ def parse_operations(operation_list):
         return operations, []
 
     @staticmethod
-    def prep_events(df):
+    def prep_data(df):
         """ Replace all n/a entries in the data frame by np.NaN for processing.
 
         Parameters:
@@ -171,7 +171,7 @@ def prep_events(df):
         return df.replace('n/a', np.NaN)
 
     @staticmethod
-    def post_prep_events(df):
+    def post_proc_data(df):
         """ Replace all nan entries with 'n/a' for BIDS compliance
 
         Parameters:
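The renames signal that these helpers handle any tabular data, not just event files. A standalone sketch of the round trip (post_proc_data's body is not shown in this diff, so the fillna below is an assumption):

import numpy as np
import pandas as pd

def prep_data(df):
    # As in the diff: convert BIDS 'n/a' entries to NaN for processing.
    return df.replace('n/a', np.nan)

def post_proc_data(df):
    # Assumed implementation of the reverse step for BIDS compliance.
    return df.fillna('n/a')

df = pd.DataFrame({'response': ['left', 'n/a', 'right']})
prepped = prep_data(df)
print(int(prepped['response'].isna().sum()))         # 1 missing value during processing
print(post_proc_data(prepped)['response'].tolist())  # ['left', 'n/a', 'right']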
6 changes: 4 additions & 2 deletions hed/tools/remodeling/operations/base_context.py
@@ -3,6 +3,8 @@
 import json
 from hed.tools.util.io_util import generate_filename
 
+DISPLAY_INDENT = "   "
+
 
 class BaseContext(ABC):
     """ Abstract base class for summary contexts. Should not be instantiated.
@@ -71,13 +73,13 @@ def get_text_summary(self, title='', include_individual=True):
             sum_list = []
             for name, individual_result in result["Individual files"].items():
                 sum_list.append(self._get_result_string(name, individual_result))
-            summary_details = summary_details + "\n" + "\n".join(sum_list)
+            summary_details = summary_details + "\n\nIndividual files:\n\n" + "\n\n".join(sum_list)
         if title:
             title_str = title + "\n"
         else:
             title_str = ''
         sum_str = f"{title_str}Context name: {self.context_name}\n" + f"Context type: {self.context_type}\n" + \
-                  f"Context filename: {self.context_filename}\n" + f"\nSummary details:\n{summary_details}"
+                  f"Context filename: {self.context_filename}\n" + f"\nSummary details:\n\n{summary_details}"
         return sum_str
 
     def save(self, save_dir, file_formats=['.txt'], include_individual=True):
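A minimal sketch of the new text layout: individual-file results now get their own labeled section with blank-line separators (strings invented):

sum_list = ["sub-01: 3 columns", "sub-02: 3 columns"]
summary_details = "Overall: 2 files"
summary_details = summary_details + "\n\nIndividual files:\n\n" + "\n\n".join(sum_list)
print(summary_details)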