Commit ad2b0b9

Merge pull request #20 from valeriocaporioniunipi/valerio

Valerio

valeriocaporioniunipi authored May 16, 2024
2 parents aa52037 + 0e03650 commit ad2b0b9
Showing 4 changed files with 106 additions and 23 deletions.
Binary file modified code/__pycache__/abspath.cpython-311.pyc
Binary file modified code/__pycache__/csvreader.cpython-311.pyc
53 changes: 47 additions & 6 deletions code/csvreader.py
@@ -32,10 +32,9 @@ def csv_reader(csv_file, column_name=None, show_flag=False):
print(df[column_name])
return np.array(df[column_name].values)
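
For orientation, csv_reader (shown here only in part) returns a single column as a 1-D array when column_name is given, and the full table otherwise; a hypothetical call, with the file name as an assumption:

    ages = csv_reader("abide.csv", column_name="AGE_AT_SCAN")  # 1-D array of the target column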

-def get_data(filename, target_name, ex_cols = 0):
+def get_data(filename, target_name, ex_cols = 0, **kwargs):
"""
get_data obtains the features and target arrays
:param filename: path to the CSV file with the data
:type filename: str
:param target_name: name of the column of the CSV file that contains targets
@@ -44,18 +43,60 @@ def get_data(filename, target_name, ex_cols = 0):
:type ex_cols: int
:param group_name: optional keyword argument, name of the CSV column holding group labels
:return: numpy arrays of features and targets (and groups, when group_name is given)
:rtype: numpy.ndarray, numpy.ndarray
"""
group_name = kwargs.get('group_name', None)
logger.info(f'Reading data from file {os.path.basename(filename)}, with {target_name} as target column')
features = csv_reader(filename)[:, ex_cols:]
targets = csv_reader(filename, target_name)

# Checking if the first dimension of features matches the length of targets
if len(features) != len(targets):
logger.error("Number of samples in features and targets do not match")

-    return features, targets
+    if group_name:
+        group = csv_reader(filename, group_name)
+        return features, targets, group
+    else:
+        return features, targets
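
For context, a minimal usage sketch of the new group_name keyword (the file name and the ex_cols value are hypothetical):

    from csvreader import get_data

    # Three arrays come back when a group column is requested, two otherwise
    features, targets, group = get_data("abide.csv", "AGE_AT_SCAN",
                                        ex_cols=2, group_name="DX_GROUP")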

def oversampling(features, targets, **kwargs):
    """
    oversampling resamples the data with replacement so that the distribution
    of targets is approximately flat across bins; an optional group array is
    resampled alongside
    """
    # Extract optional parameters
    bins = kwargs.get('bins', 10)
    group = kwargs.get('group', None)

# Calculate target histogram
hist, edges = np.histogram(targets, bins=bins)

# Find the bin with the maximum count
max_bin_index = np.argmax(hist)
max_count = hist[max_bin_index]

# Check if the bin with the maximum count has samples available
if max_count == 0:
raise ValueError("No samples available in the bin with the maximum count for oversampling. Adjust bin size or provide more data.")

# Oversample the minority classes to match the maximum count
oversampled_features = []
oversampled_targets = []
oversampled_group = [] if group is not None else None

    for i in range(bins):
        # Indices of samples in the current bin (range(bins - 1) would skip
        # the last bin, which must also include its right edge)
        upper = targets <= edges[i + 1] if i == bins - 1 else targets < edges[i + 1]
        bin_indices = np.where((targets >= edges[i]) & upper)[0]

        # Skip empty bins: np.random.choice cannot draw from an empty index set
        if len(bin_indices) == 0:
            continue

        # Randomly sample with replacement from the indices to match max_count
        sampled_indices = np.random.choice(bin_indices, size=max_count, replace=True)

# Append the sampled features and targets to the oversampled lists
oversampled_features.append(features[sampled_indices])
oversampled_targets.append(targets[sampled_indices])
if group is not None:
oversampled_group.append(group[sampled_indices])

# Concatenate the oversampled features and targets
new_features = np.concatenate(oversampled_features)
new_targets = np.concatenate(oversampled_targets)
new_group = np.concatenate(oversampled_group) if group is not None else None

    # Parenthesization matters here: the original one-liner bound the ternary to
    # new_group alone, so a 3-tuple was returned even when no group was given
    if group is not None:
        return new_features, new_targets, new_group
    return new_features, new_targets
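
A quick self-check of the flattening behaviour (synthetic data; a sketch, not project code):

    import numpy as np
    from csvreader import oversampling

    rng = np.random.default_rng(0)
    feats = rng.normal(size=(500, 3))
    ages = rng.exponential(scale=20.0, size=500)   # deliberately skewed targets

    new_feats, new_ages = oversampling(feats, ages, bins=5)
    print(np.histogram(new_ages, bins=5)[0])       # per-bin counts come out roughly equal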

def csv_reader_parsing():

76 changes: 59 additions & 17 deletions code/neural_net.py
@@ -15,14 +15,14 @@
from scikeras.wrappers import KerasRegressor

from abspath import abs_path
-from csvreader import get_data
+from csvreader import get_data, oversampling

def create_neural_net(input_shape,
-                     num_hidden_layers,
-                     optimizer='adam',
-                     metrics=['mae'],
-                     summary_flag=False):
+                     num_hidden_layers = 1,
+                     num_hidden_layer_nodes = 32,
+                     optimizer = 'adam',
+                     metrics = ['mae'],
+                     summary_flag = False):
"""
    create_neural_net creates an instance of the Sequential class of Keras,
    creating a Neural Network with a variable number of hidden layers, each with a configurable number of nodes (32 by default),
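
As a reading aid, a minimal sketch of the architecture the signature and docstring describe (the function body is folded out of this diff, so the activation, output layer and loss here are assumptions):

    from keras.models import Sequential
    from keras.layers import Dense, Input

    def sketch_net(input_shape, num_hidden_layers=1, num_hidden_layer_nodes=32):
        # Stack of identical fully-connected hidden layers ending in a single
        # regression output (predicted age); 'mae' matches the default metric
        model = Sequential([Input(shape=input_shape)])
        for _ in range(num_hidden_layers):
            model.add(Dense(num_hidden_layer_nodes, activation="relu"))
        model.add(Dense(1))
        model.compile(optimizer="adam", loss="mae", metrics=["mae"])
        return model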
@@ -119,12 +119,16 @@ def training(features, targets, model, epochs, **kwargs):

# Optional kwargs
n_splits = kwargs.get('n_splits', 5)
group = kwargs.get('group', None)
hist_flag = kwargs.get('hist_flag', False)
plot_flag = kwargs.get('plot_flag', False)
# Renaming data
x = features
y = targets

    # Boolean flag: True when experimental/control group labels were provided
    isgroup = group is not None

# Standardization of features
scaler = StandardScaler()
# since k-folding is implemented, standardization occurs after data splitting
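
The comment above is about avoiding train/test leakage; a minimal illustration of the pattern (synthetic data, not project code):

    import numpy as np
    from sklearn.model_selection import KFold
    from sklearn.preprocessing import StandardScaler

    x = np.random.rand(100, 4)
    for train_idx, test_idx in KFold(n_splits=5).split(x):
        scaler = StandardScaler()
        x_train = scaler.fit_transform(x[train_idx])   # statistics come from the training fold only
        x_test = scaler.transform(x[test_idx])         # the test fold reuses those statistics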
@@ -144,7 +148,10 @@
figh, axh = plt.subplots(figsize=(10,8))

    if plot_flag:
-        figp, axp = plt.subplots(figsize=(10, 8))
+        if isgroup:
+            figp, (axp, axp_group) = plt.subplots(1, 2, figsize=(20, 8))
+        else:
+            figp, axp = plt.subplots(figsize=(10, 8))

colormap = cmaps.get_cmap('tab20')
colors = [colormap(i) for i in range(n_splits + 1)]
@@ -160,6 +167,8 @@
# Split data into training and testing sets
x_train, x_test = x[train_index], x[test_index]
y_train, y_test = y[train_index], y[test_index]
if isgroup:
group_test = group[test_index]

    # Standardization (after the split)
x_train = scaler.fit_transform(x_train)
Expand Down Expand Up @@ -195,8 +204,20 @@ def training(features, targets, model, epochs, **kwargs):

# Plotting actual vs. predicted values for current fold
if plot_flag:
-            axp.scatter(y_test, y_pred, alpha=0.5, color = colors[i],
-                        label=f'Fold {i} - MAE = {np.round(mae_scores[i-1], 2)}')
+            axp.scatter(y_test, y_pred,
+                        alpha=0.5,
+                        color = colors[i],
+                        label=f'Fold {i} - MAE = {np.round(mae_scores[i-1], 2)}')
+            if isgroup:
+                y_test_exp = y_test[group_test == 1]
+                y_pred_exp = y_pred[group_test == 1]
+                y_test_control = y_test[group_test == -1]
+                y_pred_control = y_pred[group_test == -1]
+                axp_group.scatter(y_test_exp, y_pred_exp, color = 'k')
+                axp_group.scatter(y_test_control, y_pred_control, color = 'r')

if hist_flag:
axh.set_xlabel("epoch")
@@ -221,12 +242,20 @@
# Setting plot labels and title
axp.set_xlabel('Actual')
axp.set_ylabel('Predicted')
-    axp.set_title('Actual vs. predicted age')
+    axp.set_title(f'Actual vs. predicted age - {n_splits} folds')

-    # Adding legend and grid to the plot
-    figp.legend()
+    # Adding legend and grid to the plots
+    figp.legend(loc = 'upper left')
axp.grid(True)

if isgroup:
axp_group.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
axp_group.set_xlabel('Actual')
axp_group.set_ylabel('Predicted')
        axp_group.set_title('Actual vs. predicted age - exp. vs. control')
axp_group.grid(True)
exp_legend = axp_group.scatter([], [], marker = 'o', color = 'k', label = 'exp.')
control_legend = axp_group.scatter([], [], marker = 'o', color = 'r', label = 'control')
figp.legend(handles = [exp_legend, control_legend], loc='upper right')
else:
logger.info("Skipping the plot of actual vs predicted age ")

@@ -243,7 +272,7 @@ def neural_net_parsing():
parser.add_argument("filename",
help="Name of the file that has to be analized")
parser.add_argument("--target", default = "AGE_AT_SCAN",
help="Name of the colums holding target values")
help="Name of the column holding target values")
parser.add_argument("--location",
help="Location of the file, i.e. folder containing it")
parser.add_argument("--hidden_layers", type = int, default = 1,
@@ -262,6 +291,12 @@ def neural_net_parsing():
help="Show the history of the training")
parser.add_argument("--plot", action="store_true",
help="Show the plot of actual vs predicted brain age")
parser.add_argument("--group", default = 'DX_GROUP',
help="Name of the column indicating the group (experimental vs control)")
parser.add_argument("--overs", action = 'store_true', default = True,
help="Oversampling, done in order to have a flat distribution of targets (default = True).")
parser.add_argument("--bins", type = int, default = 10,
help="Number of bins in resampling (default 0 20)")
parser.add_argument("--grid", action = "store_true",
help="Grid search for hyperparameter optimization")

@@ -271,7 +306,9 @@ def neural_net_parsing():
args.filename = abs_path(args.filename,
args.location) if args.location else args.filename
logger.info(f"Opening file : {args.filename}")
features, targets = get_data(args.filename, args.target, args.ex_cols)
features, targets, group = get_data(args.filename, args.target, args.ex_cols, group_name = args.group)
if args.overs:
features, targets, group = oversampling(features, targets, group=group)
epochs = args.epochs
input_shape = np.shape(features[0])
if not args.grid:
Expand All @@ -284,6 +321,7 @@ def neural_net_parsing():
model,
epochs,
n_splits = args.folds,
group = group,
hist_flag = args.history,
plot_flag = args.plot)
else: # args.grid
Expand Down Expand Up @@ -317,15 +355,19 @@ def neural_net_parsing():
for mean, std, param in zip(means, stds, params):
logger.info(f"{mean} ({std}) with: {param}")
        model = create_neural_net(input_shape,
-                                 num_hidden_layers = grid_result.best_params_["model__num_hidden_layers"],
-                                 num_hidden_layer_nodes = grid_result.best_params_["model__num_hidden_nodes"],
-                                 optimizer= grid_result.best_params_["model__optimizer"],
+                                 num_hidden_layers =
+                                     grid_result.best_params_["model__num_hidden_layers"],
+                                 num_hidden_layer_nodes =
+                                     grid_result.best_params_["model__num_hidden_nodes"],
+                                 optimizer =
+                                     grid_result.best_params_["model__optimizer"],
                                  summary_flag = args.summary)
training(features,
targets,
model,
epochs,
n_splits = args.folds,
group = group,
hist_flag = args.history,
plot_flag = args.plot)
except FileNotFoundError:
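
The grid-search setup itself is folded out of this diff; a sketch consistent with the best_params_ keys above, with the parameter grids, scoring and wrapper arguments as assumptions:

    from scikeras.wrappers import KerasRegressor
    from sklearn.model_selection import GridSearchCV

    # scikeras forwards routed "model__*" parameters to the model-building callable;
    # this assumes the builder accepts the parameter names used in the keys above
    regressor = KerasRegressor(model=create_neural_net, model__input_shape=input_shape,
                               epochs=epochs, verbose=0)
    param_grid = {
        "model__num_hidden_layers": [1, 2, 4],
        "model__num_hidden_nodes": [16, 32, 64],
        "model__optimizer": ["adam", "rmsprop"],
    }
    grid = GridSearchCV(regressor, param_grid, cv=5, scoring="neg_mean_absolute_error")
    grid_result = grid.fit(features, targets)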
