diff --git a/code/__pycache__/abspath.cpython-311.pyc b/code/__pycache__/abspath.cpython-311.pyc index 6eb146a..3e74eb9 100644 Binary files a/code/__pycache__/abspath.cpython-311.pyc and b/code/__pycache__/abspath.cpython-311.pyc differ diff --git a/code/__pycache__/csvreader.cpython-311.pyc b/code/__pycache__/csvreader.cpython-311.pyc index 5859048..95d2845 100644 Binary files a/code/__pycache__/csvreader.cpython-311.pyc and b/code/__pycache__/csvreader.cpython-311.pyc differ diff --git a/code/abspath.py b/code/abspath.py index 626801c..5475a42 100644 --- a/code/abspath.py +++ b/code/abspath.py @@ -1,9 +1,9 @@ import os -def AbsolutePath(local_filename, data_folder): +def abs_path(local_filename, data_folder): """ - AbsolutePath gets the absolute path of the file given the name of the folder containing the data + abs_path gets the absolute path of the file given the name of the folder containing the data and the name of the file inside that folder and assuming that the repository contains a data folder and a code folder. diff --git a/code/csvreader.py b/code/csvreader.py index 984db83..2f1216d 100644 --- a/code/csvreader.py +++ b/code/csvreader.py @@ -4,14 +4,12 @@ import pandas as pd from loguru import logger - -def GetData(csv_file, column_name=None, show_flag=False): - +def csv_reader(csv_file, column_name=None, show_flag=False): """ - GetData allows to read the data from a CSV file and converts them into a NumPy array. + csv_reader allows to read the data from a CSV file and converts them into a NumPy array. It can also show the entire dataset as a Pandas dataframe on terminal or show a single column of the data table. - the GetData function does not show the dataframe, unless specified by changing show_flag argument. + The csv_reader function does not show the dataframe, unless specified by changing show_flag argument. :param csvfile: path to the CSV file :type csvfile: str @@ -21,7 +19,7 @@ def GetData(csv_file, column_name=None, show_flag=False): :type show_flag: bool :return: the function returns a multidimensional numpy array if no column_name is passed as argument, otherwise it returns a unidimensional numpy array :rtype: numpy.ndarray - + """ df = pd.read_csv(csv_file, delimiter=';') if column_name is None: @@ -34,6 +32,29 @@ def GetData(csv_file, column_name=None, show_flag=False): print(df[column_name]) return np.array(df[column_name].values) +def get_data(filename, target_name, ex_cols = 0): + """ + get_data obtains the features and target arrays + + Arguments: + - filename (str): name of the file which data are read from + - target_name (str): name of the column of the csv file that contains targets + - ex_cols (int): optional, default = 0. Excluded columns + + Return: + - features (ndarray): array of features + - targets (ndarray): array of targets + """ + logger.info(f'Reading data from file {os.path.basename(filename)}, with {target_name} as target column ') + features = csv_reader(filename)[:, ex_cols:] + targets = csv_reader(filename, target_name) + + # Checking if the first dimension of features matches the length of targets + if len(features) != len(targets): + logger.error("Number of samples in features and targets do not match") + + return features, targets + def main(): parser = argparse.ArgumentParser(description="CSV Reader - A tool to read CSV files with Pandas.") @@ -46,12 +67,12 @@ def main(): try: if args.command == "show": - GetData(args.filename, show_flag=True) + csv_reader(args.filename, show_flag=True) elif args.command == "show_column": if not args.column: parser.error("The '--column' argument is required for 'show_column' command.") else: - GetData(args.filename, args.column, show_flag=True) + csv_reader(args.filename, args.column, show_flag=True) except FileNotFoundError as e: logger.error("File not found", e) diff --git a/code/gaussian_reg.py b/code/gaussian_reg.py index 793f544..83c981a 100644 --- a/code/gaussian_reg.py +++ b/code/gaussian_reg.py @@ -8,13 +8,13 @@ from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score from sklearn.preprocessing import StandardScaler -from abspath import AbsolutePath -from csvreader import GetData +from abspath import abs_path +from csvreader import get_data -def GaussRegression(filename, n_splits, ex_cols=0, plot_flag=False): +def gaussian_reg(filename, n_splits, ex_cols=0, plot_flag=False): """ - GaussRegression performs a gaussian regression with k-fold cross-validation - on the given dataset and prints evaluation metrics of the gaussian regression model. + gaussian_reg performs a Gaussian regression with k-fold cross-validation on the given dataset + and prints evaluation metrics of the gaussian regression model. :param filename: path to the CSV file containing the dataset :type filename: str @@ -29,8 +29,8 @@ def GaussRegression(filename, n_splits, ex_cols=0, plot_flag=False): """ # Loading data... #Importing features excluded first three columns: FILE_ID, AGE_AT_SCAN, SEX - x = GetData(filename)[:, ex_cols:] - y = GetData(filename, "AGE_AT_SCAN") + x = get_data(filename)[:, ex_cols:] + y = get_data(filename, "AGE_AT_SCAN") # Standardize features scaler = StandardScaler() @@ -95,30 +95,42 @@ def GaussRegression(filename, n_splits, ex_cols=0, plot_flag=False): # Show the plot plt.show() -def main(): - parser = argparse.ArgumentParser(description='Gaussian regression with k-fold cross-validation predicting the age of patients from magnetic resonance imaging') - - parser.add_argument("filename", help="Name of the file that has to be analyzed") - parser.add_argument("--location", help="Location of the file, i.e. folder containing it") - parser.add_argument("--n_splits", type=int, default=5, help="Number of folds for k-folding cross-validation") - parser.add_argument("--ex_cols", type = int, default=3, help="Number of columns excluded when importing data") - parser.add_argument("--plot", action='store_true', help="Show the plot of actual vs predicted brain age") +def gaussian_reg_parsing(): + """ + Parsing from terminal + """ + parser = argparse.ArgumentParser(description= + 'Gaussian regression predicting the age of patients from magnetic resonance imaging') + + parser.add_argument("filename", + help="Name of the file that has to be analized") + parser.add_argument("--target", default = "AGE_AT_SCAN", + help="Name of the colums holding target values") + parser.add_argument("--location", + help="Location of the file, i.e. folder containing it") + parser.add_argument("--folds", type = int, default = 5, + help="Number of folds in the k-folding (>4, default 5)") + parser.add_argument("--ex_cols", type = int, default = 3, + help="Number of columns excluded when importing (default 3)") + parser.add_argument("--plot", action="store_true", + help="Show the plot of actual vs predicted brain age") args = parser.parse_args() - if args.n_splits > 4: + if args.folds > 4: try: - if not args.location: - GaussRegression(args.filename, n_splits=args.n_splits, ex_cols = args.ex_cols, plot_flag=args.plot) - else: - args.filename = AbsolutePath(args.filename, args.location) - GaussRegression(args.filename, n_splits=args.n_splits, ex_cols = args.ex_cols, plot_flag=args.plot) + args.filename = abs_path(args.filename, + args.location) if args.location else args.filename + logger.info(f"Opening file : {args.filename}") + features, targets = get_data(args.filename, args.target, args.ex_cols) + gaussian_reg(features, targets, args.epochs, args.folds, + args.summary, args.history, args.plot) except FileNotFoundError: logger.error("File not found.") - return None - else: - logger.error("Invalid number of folds: at least 5 folds required") + else: + logger.error("Invalid number of folds: at least 5 folds required.") + if __name__ == "__main__": - main() + gaussian_reg_parsing() diff --git a/code/linear_reg.py b/code/linear_reg.py index 3ad6c62..0487dc1 100644 --- a/code/linear_reg.py +++ b/code/linear_reg.py @@ -8,13 +8,13 @@ from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score from sklearn.preprocessing import StandardScaler -from abspath import AbsolutePath -from csvreader import GetData +from abspath import abs_path +from csvreader import get_data -def LinRegression(filename, n_splits, ex_cols=0, plot_flag=False): +def linear_reg(filename, n_splits, ex_cols=0, plot_flag=False): """ - LinRegression performs linear regression with k-fold cross-validation on the + linear_reg performs linear regression with k-fold cross-validation on the given dataset and prints evaluation metrics of the linear regression model. :param filename: path to the CSV file containing the dataset @@ -30,8 +30,8 @@ def LinRegression(filename, n_splits, ex_cols=0, plot_flag=False): """ # Loading data... #Importing features excluded first three columns: FILE_ID, AGE_AT_SCAN, SEX - x = GetData(filename)[:, ex_cols:] - y = GetData(filename, "AGE_AT_SCAN") + x = get_data(filename)[:, ex_cols:] + y = get_data(filename, "AGE_AT_SCAN") # Standardize features scaler = StandardScaler() @@ -95,29 +95,41 @@ def LinRegression(filename, n_splits, ex_cols=0, plot_flag=False): # Show the plot plt.show() -def main(): - parser = argparse.ArgumentParser(description='Linear regression with k-fold cross-validation predicting the age of patients from magnetic resonance imaging') - - parser.add_argument("filename", help="Name of the file that has to be analyzed") - parser.add_argument("--location", help="Location of the file, i.e. folder containing it") - parser.add_argument("--ex_cols", type = int, default = 3, help="Number of columns excluded when importing data") - parser.add_argument("--n_splits", type=int, default=5, help="Number of folds for k-folding cross-validation") - parser.add_argument("--plot", action='store_true', help="Show the plot of actual vs predicted brain age") +def linear_reg_parsing(): + """ + Parsing from terminal + """ + parser = argparse.ArgumentParser(description= + 'Linear regression predicting the age of patients from magnetic resonance imaging') + + parser.add_argument("filename", + help="Name of the file that has to be analized") + parser.add_argument("--target", default = "AGE_AT_SCAN", + help="Name of the colums holding target values") + parser.add_argument("--location", + help="Location of the file, i.e. folder containing it") + parser.add_argument("--folds", type = int, default = 5, + help="Number of folds in the k-folding (>4, default 5)") + parser.add_argument("--ex_cols", type = int, default = 3, + help="Number of columns excluded when importing (default 3)") + parser.add_argument("--plot", action="store_true", + help="Show the plot of actual vs predicted brain age") args = parser.parse_args() - if args.n_splits > 4: + if args.folds > 4: try: - if not args.location: - LinRegression(args.filename, n_splits=args.n_splits, ex_cols = args.ex_cols, plot_flag=args.plot) - else: - args.filename = AbsolutePath(args.filename, args.location) - LinRegression(args.filename, n_splits=args.n_splits, ex_cols = args.ex_cols, plot_flag=args.plot) + args.filename = abs_path(args.filename, + args.location) if args.location else args.filename + logger.info(f"Opening file : {args.filename}") + features, targets = get_data(args.filename, args.target, args.ex_cols) + linear_reg(features, targets, args.epochs, args.folds, + args.summary, args.history, args.plot) except FileNotFoundError: logger.error("File not found.") - return None - else: - logger.error("Invalid number of folds: at least 5 folds required") + else: + logger.error("Invalid number of folds: at least 5 folds required.") + if __name__ == "__main__": - main() + linear_reg_parsing() diff --git a/code/neural_net.py b/code/neural_net.py index cf8b907..2572989 100644 --- a/code/neural_net.py +++ b/code/neural_net.py @@ -1,30 +1,100 @@ +""" +Module neural_net trains neural networks in order to guess age from brain features +""" import argparse import numpy as np from loguru import logger from matplotlib import pyplot as plt +from matplotlib import colormaps as cmaps from keras import Sequential from keras import layers - -# from sklearn.model_selection import train_test_split -from sklearn.model_selection import KFold +from sklearn.model_selection import KFold, GridSearchCV from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score from sklearn.preprocessing import StandardScaler +from scikeras.wrappers import KerasRegressor + +from abspath import abs_path +from csvreader import get_data + +def create_neural_net(input_shape, + num_hidden_layers, + num_hidden_layer_nodes = 32, + optimizer='adam', + metrics=['mae'], + summary_flag=False): + """ + create_neural_net creates an instance of the Sequential class of Keras, + creating a Neural Network with variable hidden layers, each with 32 nodes, + and setting the initial weights at random values. + + Arguments: + - input_shape (tuple): shape of the data given to the input layer of the NN + - num_hidden_layers (int). Number of hidden layers in the network + - optimizer (str): optional, default = 'adam'. Optimizer to use + - metrics (list): optional, default = ['mae']. List of metrics to use + - summary_flag (bool): optional, default = False. Show the summary of the NN + + Return: the instance of the Sequential class, i.e. the model object + """ + + # Defining the model + model = Sequential() + model.add(layers.Input(shape=input_shape)) -from abspath import AbsolutePath -from csvreader import GetData + # Adding variable number of hidden layers + for _ in range(num_hidden_layers): + model.add(layers.Dense(num_hidden_layer_nodes, activation='relu')) -def NeuralNetwork(filename, epochs, n_splits, ex_cols = 0, - summary_flag=False, hist_flag=False, plot_flag=False): + model.add(layers.Dense(1, activation='linear')) # Output layer + + # Compiling the model + model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=metrics) + # Printing the summary, if specified + if summary_flag: + logger.info("Model successfully compiled, showing detailed summary ") + model.summary() + else: + logger.info(f"Model successfully compiled with {num_hidden_layers} hidden layers") + return model + +def build_model(input_shape, + num_hidden_layers=1, + num_hidden_layer_nodes = 32, + optimizer='adam', + **kwargs): + """ + Wrapper function to create a Keras model with specified hyperparameters """ - NeuralNetwork creates a neural network. Inputs data are splitted in two parts: 'train' and - 'test'; both inputs are normalized in order to have zero as mean and one as variance. - As output it prints + return create_neural_net(input_shape, num_hidden_layers, optimizer) + + +def training(features, targets, model, epochs, **kwargs): + """" + training trains a neural network with k-folding + + Arguments: + - features (ndarray): matrix of features + - targets (ndarray): array of targets + - model (SequentialType): NN model, instance of Sequential class + - epochs (int): number of epochs during neural network training + - **kwargs: additional keyword arguments for configuring the function behavior + - n_splits (int): number of folds for cross-validation + - hist_flag (bool): optional, default = False. Plot a graph showing val_loss + (labeled as validation) vs loss (labeled as training) during epochs. + - plot_flag (bool): optional, default = False. + Show the plot of actual vs predicted brain age. + + Return: + - scores (ndarray): array holding MAE, MSE and R-squared, averaged among the folds + + Printing: - MAE (mean absolute error) - MSE (mean squared error) - R-squared - and optionally shows + Optionally showing: + - Actual vs Predicted brain age scatter plot - Training history plot @@ -46,10 +116,14 @@ def NeuralNetwork(filename, epochs, n_splits, ex_cols = 0, return: None """ - # Loading data... - #Importing features excluded first three columns: FILE_ID, AGE_AT_SCAN, SEX - x = GetData(filename)[:, ex_cols:] - y = GetData(filename, "AGE_AT_SCAN") + + # Optional kwargs + n_splits = kwargs.get('n_splits', 5) + hist_flag = kwargs.get('hist_flag', False) + plot_flag = kwargs.get('plot_flag', False) + # Renaming data + x = features + y = targets # Standardization of features scaler = StandardScaler() @@ -57,42 +131,29 @@ def NeuralNetwork(filename, epochs, n_splits, ex_cols = 0, # in order to avoid information leakage (information from the validation or test set # would inadvertently influence the preprocessing steps). - # Initialize k-fold cross-validation - kf = KFold(n_splits=n_splits) + # Initialization of k-fold cross-validation + kf = KFold(n_splits=n_splits, shuffle = True) - # Initialize lists to store evaluation metrics + # Initialization of lists to store evaluation metrics mae_scores = [] mse_scores = [] r2_scores = [] + # Initializing figures for plotting and creating rlated colours + if hist_flag: + figh, axh = plt.subplots(figsize=(10,8)) - # Defining the model - model = Sequential() - model.add(layers.Input(shape = np.shape(x[0]))) - # Defining the model outside is better from a computational-resources point of view. - # The shape of x[0] is the same of x_train[0], which will be defined later, and - # [0] is needed in order to pass the shape of a single feature array (the first, for instance) - model.add(layers.Dense(64, activation='relu')) - model.add(layers.Dense(32, activation='relu')) - model.add(layers.Dense(1, activation='linear')) # Output layer - - # Compiling the model - model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae']) - initial_weights = model.get_weights() - logger.info("Model successfully compiled.") + if plot_flag: + figp, axp = plt.subplots(figsize=(10, 8)) - # Printing the summary, if specified - if summary_flag: - model.summary() - else: - logger.info("Skipping model summary.") + colormap = cmaps.get_cmap('tab20') + colors = [colormap(i) for i in range(n_splits + 1)] - # Initialize figures for plotting - if hist_flag: - fig1, ax1 = plt.subplots(figsize=(10,8)) + # Storing the initial weights in order to refresh them after every fold training + initial_weights = model.get_weights() - if plot_flag: - fig2, ax2 = plt.subplots(figsize=(10, 8)) + # Initializing the list that will hold the models once trained + models =[] # Perform k-fold cross-validation for i, (train_index, test_index) in enumerate(kf.split(x), 1): @@ -104,24 +165,26 @@ def NeuralNetwork(filename, epochs, n_splits, ex_cols = 0, x_train = scaler.fit_transform(x_train) x_test = scaler.transform(x_test) - # Resetting model's weights + # Training the model (after having re-initialized the weights) model.set_weights(initial_weights) - - # Training the model - logger.info(f"Training the model with dataset {i}/{n_splits}") + logger.info(f"Training the model with dataset {i}/{n_splits} for {epochs} epochs ") history = model.fit(x_train, y_train, epochs=epochs, batch_size=32, validation_split=0.1) # Predict on the test set y_pred = model.predict(x_test) + # Appending the model to models list + models.append(model) + #Appending vectors with history data if hist_flag: validation_loss = history.history['val_loss'] training_loss = history.history['loss'] - ax1.plot(training_loss, label=f"Tr. {i}", color = "r", alpha = 1/i) - ax1.plot(validation_loss, label=f"Val. {i}", color = "k", alpha = 1/i) + axh.plot(training_loss, label=f"Tr. {i}", color = colors[i]) + axh.plot(validation_loss, label=f"Val. {i}", color = colors[i], ls = 'dashed') + axh.set_yscale('log') - # Evaluate the model + # Evaluating the model mae = mean_absolute_error(y_test, y_pred) mse = mean_squared_error(y_test, y_pred) r2 = r2_score(y_test, y_pred) @@ -130,44 +193,47 @@ def NeuralNetwork(filename, epochs, n_splits, ex_cols = 0, mse_scores.append(mse) r2_scores.append(r2) - # Plot actual vs. predicted values for current fold + # Plotting actual vs. predicted values for current fold if plot_flag: - ax2.scatter(y_test, y_pred, alpha=0.5, + axp.scatter(y_test, y_pred, alpha=0.5, color = colors[i], label=f'Fold {i} - MAE = {np.round(mae_scores[i-1], 2)}') if hist_flag: - ax1.set_xlabel("epoch") - ax1.set_ylabel("loss") - ax1.set_title('History of training') - fig1.legend() + axh.set_xlabel("epoch") + axh.set_ylabel("loss") + axh.set_title(f'History losses in {epochs} epochs') + figh.legend() else: - logger.info("Skipping the plot of training history.") + logger.info("Skipping the plot of training history ") - # Print average evaluation metrics over all folds + # Printing average evaluation metrics over all folds print("Mean Absolute Error:", np.mean(mae_scores)) print("Mean Squared Error:", np.mean(mse_scores)) print("R-squared:", np.mean(r2_scores)) + scores = np.array([np.mean(mae_scores), np.mean(mse_scores), np.mean(r2_scores)]) + if plot_flag: - # Plot the ideal line (y=x) - ax2.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2) + # Plotting the ideal line (y=x) + axp.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2) - # Set plot labels and title - ax2.set_xlabel('Actual') - ax2.set_ylabel('Predicted') - ax2.set_title('Actual vs. Predicted Brain Age') + # Setting plot labels and title + axp.set_xlabel('Actual') + axp.set_ylabel('Predicted') + axp.set_title('Actual vs. predicted age') - # Add legend and grid to the plot - fig2.legend() - ax2.grid(True) + # Adding legend and grid to the plot + figp.legend() + axp.grid(True) else: - logger.info("Skipping the plot of actual vs predicted brain age.") + logger.info("Skipping the plot of actual vs predicted age ") plt.show() + return scores -def main(): +def neural_net_parsing(): """ Parsing from terminal """ @@ -176,8 +242,14 @@ def main(): parser.add_argument("filename", help="Name of the file that has to be analized") + parser.add_argument("--target", default = "AGE_AT_SCAN", + help="Name of the colums holding target values") parser.add_argument("--location", help="Location of the file, i.e. folder containing it") + parser.add_argument("--hidden_layers", type = int, default = 1, + help="Number of hidden layers in the neural network") + parser.add_argument("--hidden_nodes", type = int, default = 32, + help="Number of hidden layer nodes in the neural network") parser.add_argument("--epochs", type = int, default = 50, help="Number of epochs of training (default 50)") parser.add_argument("--folds", type = int, default = 5, @@ -190,21 +262,75 @@ def main(): help="Show the history of the training") parser.add_argument("--plot", action="store_true", help="Show the plot of actual vs predicted brain age") + parser.add_argument("--grid", action = "store_true", + help="Grid search for hyperparameter optimization") args = parser.parse_args() - if args.folds > 4: - try: - args.filename = AbsolutePath(args.filename, - args.location) if args.location else args.filename - logger.info(f"Opening file : {args.filename}") - NeuralNetwork(args.filename, args.epochs, args.folds, - args.ex_cols, args.summary, args.history, args.plot) - except FileNotFoundError: - logger.error("File not found.") - else: - logger.error("Invalid number of folds: at least 5 folds required.") + try: + args.filename = abs_path(args.filename, + args.location) if args.location else args.filename + logger.info(f"Opening file : {args.filename}") + features, targets = get_data(args.filename, args.target, args.ex_cols) + epochs = args.epochs + input_shape = np.shape(features[0]) + if not args.grid: + model = create_neural_net(input_shape, + num_hidden_layers = args.hidden_layers, + num_hidden_layer_nodes = args.hidden_nodes, + summary_flag = args.summary) + training(features, + targets, + model, + epochs, + n_splits = args.folds, + hist_flag = args.history, + plot_flag = args.plot) + else: # args.grid + param_grid = { + 'model__num_hidden_layers': [1, 4, 6], + 'model__num_hidden_nodes' : [32, 48], + 'model__optimizer': ['adam', 'sgd', 'rmsprop'] + } + + keras_regressor = KerasRegressor(model=lambda **kwargs: build_model(input_shape, **kwargs), + epochs=epochs, + batch_size=32, + verbose=0) + grid = GridSearchCV(estimator=keras_regressor, + param_grid=param_grid, + scoring='neg_mean_absolute_error', + refit = False, + cv = args.folds) + scaler = StandardScaler() + x_scaled = scaler.fit_transform(features) + + # Fitting grid search + logger.info("Starting Grid Search for hyperparameter optimization") + grid_result = grid.fit(x_scaled, targets) + + # Summarize results + logger.info(f"Best: {grid_result.best_score_} using {grid_result.best_params_}") + means = grid_result.cv_results_['mean_test_score'] + stds = grid_result.cv_results_['std_test_score'] + params = grid_result.cv_results_['params'] + for mean, std, param in zip(means, stds, params): + logger.info(f"{mean} ({std}) with: {param}") + model = create_neural_net(input_shape, + num_hidden_layers = grid_result.best_params_["model__num_hidden_layers"], + num_hidden_layer_nodes = grid_result.best_params_["model__num_hidden_nodes"], + optimizer= grid_result.best_params_["model__optimizer"], + summary_flag = args.summary) + training(features, + targets, + model, + epochs, + n_splits = args.folds, + hist_flag = args.history, + plot_flag = args.plot) + except FileNotFoundError: + logger.error("File not found.") if __name__ == "__main__": - main() + neural_net_parsing()