Skip to content

Commit

Permalink
typos
Browse files Browse the repository at this point in the history
  • Loading branch information
tuonopettinato committed May 19, 2024
1 parent c9b7853 commit bb3b585
Show file tree
Hide file tree
Showing 5 changed files with 72 additions and 24 deletions.
Binary file modified .DS_Store
Binary file not shown.
Empty file added code/__init__.py.py
Empty file.
Binary file added code/__pycache__/utils.cpython-311.pyc
Binary file not shown.
8 changes: 7 additions & 1 deletion code/neural_net.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,8 @@ def neural_net_parsing():
help="Optimizer (default = 'rmsprop')")
parser.add_argument("--folds", type = int, default = 5,
help="Number of folds in the k-folding (>4, default 5)")
parser.add_argument("--dropout", type = float, default = 0.05,
help="Dropout rate in the NN (default 0.05)")
parser.add_argument("--ex_cols", type = int, default = 3,
help="Number of columns excluded when importing (default 3)")
parser.add_argument("--summary", action="store_true",
Expand All @@ -315,6 +317,8 @@ def neural_net_parsing():
"a flat distribution of targets (default = True).")
parser.add_argument("--bins", type = int, default = 10,
help="Number of bins in resampling (default 0 20)")
parser.add_argument("--harm",
help="Name of the column of sites, used for data harmonization")
parser.add_argument("--grid", action = "store_true",
help="Grid search for hyperparameter optimization")

Expand All @@ -327,13 +331,15 @@ def neural_net_parsing():
features, targets, group = get_data(args.filename,
args.target,
args.ex_cols,
group_name = args.group)
group_col = args.group,
site_col = args.harm)
epochs = args.epochs
input_shape = np.shape(features[0])
if not args.grid:
model = create_reg_nn(input_shape,
hidden_layers = args.hidden_layers,
hidden_nodes = args.hidden_nodes,
dropout = args.dropout,
optimizer = args.opt,
summary_flag = args.summary)
training(features,
Expand Down
88 changes: 65 additions & 23 deletions code/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
import numpy as np
import pandas as pd
from loguru import logger
from neuroHarmonize import harmonizationLearn, harmonizationApply


def abs_path(local_filename, data_folder):
"""
abs_path gets the absolute path of the file given the name of the folder containing the data
Gets the absolute path of the file given the name of the folder containing the data
and the name of the file inside that folder and assuming that the repository contains a data folder
and a code folder.
Expand Down Expand Up @@ -40,48 +42,88 @@ def csv_reader(filename, column_name=None, show_flag=False):
:type column_name: str, optional
:param show_flag: If True, displays the dataframe
:type show_flag: bool, optional
:return: A NumPy array of the entire dataset or the specified column
:rtype: numpy.ndarray
:return: A Pandas dataframe of the entire dataset or the specified column
:rtype: pandas.df
"""
df = pd.read_csv(filename, delimiter=';')
if column_name is None:
if show_flag:
print(df)
return df.values
return df
else:
if show_flag:
print(df[column_name])
return df[column_name].values
return df[column_name]

def get_data(filename, target_name, ex_cols=0, **kwargs):
def handle_spurious(df):
    """
    Replaces spurious entries (0 and -9999) with the mean of their column.

    The input is not modified: the cleaning is done on a new DataFrame, so it is
    safe to pass a slice of a larger DataFrame.
    Column means are computed on numeric columns only and after the spurious
    entries have been masked, so they do not bias the mean. Non-numeric columns
    are left without a fill value, and a column made up entirely of spurious
    entries stays NaN (its mean is undefined).

    :param df: Input DataFrame
    :type df: pd.DataFrame
    :return: New DataFrame with spurious values replaced by column means
    :rtype: pd.DataFrame
    """
    # Mask both sentinel values; `replace` without inplace returns a new frame
    masked = df.replace([-9999, 0], np.nan)
    # numeric_only avoids a TypeError on string columns (e.g. site labels)
    column_means = masked.mean(numeric_only=True)
    return masked.fillna(column_means)

:param filename: Path to the CSV file

def get_data(filename, target_col, ex_cols=0, **kwargs):
    """
    Obtains the features and target arrays from a CSV file. Optionally harmonizes the data
    using neuroHarmonize and returns an additional array used for grouping.

    The CSV is read with ';' as delimiter. Spurious values in the feature columns
    are cleaned with handle_spurious before the (optional) harmonization.

    :param filename: Path to the CSV file.
    :type filename: str
    :param target_col: Name of the target column.
    :type target_col: str
    :param ex_cols: Number of initial columns to exclude from the features (default is 0).
    :type ex_cols: int, optional
    :param kwargs: Additional keyword arguments:
                   - group_col: Name of the group column (optional).
                   - site_col: Name of the site column for harmonization (optional).
                     The site label is the part of each entry before its last '_'.
    :return: (features, targets) when no group column is given, otherwise
             (features, targets, group).
    :rtype: tuple(numpy.ndarray, numpy.ndarray) or
            tuple(numpy.ndarray, numpy.ndarray, numpy.ndarray)
    :raises ValueError: If features and targets have a different number of samples.
    """
    group_col = kwargs.get('group_col', None)
    site_col = kwargs.get('site_col', None)
    logger.info(f'Reading {os.path.basename(filename)} with {target_col} as target column')
    data = pd.read_csv(filename, delimiter=';')

    # Work on a copy of the feature columns so that cleaning them can never
    # write through to `data`, which is still needed for targets/site/group.
    features_df = data.iloc[:, ex_cols:].copy()
    features = handle_spurious(features_df).values
    targets = data[target_col].values

    if site_col is not None:
        if site_col in data.columns:
            logger.info('Harmonizing data with neuroHarmonize ')
            # neuroHarmonize expects the site labels in a column named 'SITE'.
            # Building a fresh DataFrame avoids assigning into a slice of `data`.
            site_labels = data[site_col].str.rsplit('_', n=1).str[0]
            covars = pd.DataFrame({'SITE': site_labels})
            _, features = harmonizationLearn(features, covars)
        else:
            # Previously ignored silently: the user asked for harmonization
            # but the column is missing, so make that visible in the log.
            logger.warning(f'Site column {site_col} not found, skipping harmonization')

    if len(features) != len(targets):
        logger.error("Number of samples in features and targets do not match ")
        raise ValueError("Mismatch between number of features and targets samples")

    if group_col:
        logger.info(" Splitting into experimental "
                    f"& control group. Group column has name {group_col}")
        group = data[group_col].values
        return features, targets, group
    # No group column requested: callers get a 2-tuple
    return features, targets


def oversampling(features, targets, **kwargs):
"""
Oversamples minority classes in the dataset to balance class distribution.
Expand Down

0 comments on commit bb3b585

Please sign in to comment.