Merge branch 'main' into jacopo

valeriocaporioniunipi · May 16, 2024 · 341fbc4 · 341fbc4
2 parents 7b1519e + 5b4d9f9
commit 341fbc4
Show file tree

Hide file tree

Showing 7 changed files with 310 additions and 139 deletions.
diff --git a/code/__pycache__/abspath.cpython-311.pyc b/code/__pycache__/abspath.cpython-311.pyc
diff --git a/code/__pycache__/csvreader.cpython-311.pyc b/code/__pycache__/csvreader.cpython-311.pyc
diff --git a/code/abspath.py b/code/abspath.py
@@ -1,9 +1,9 @@
 import os
 
 
-def AbsolutePath(local_filename, data_folder):
+def abs_path(local_filename, data_folder):
     """
-    AbsolutePath gets the absolute path of the file given the name of the folder containing the data
+    abs_path gets the absolute path of the file given the name of the folder containing the data
     and the name of the file inside that folder and assuming that the repository contains a data folder
     and a code folder.
 

diff --git a/code/csvreader.py b/code/csvreader.py
@@ -4,14 +4,12 @@
 import pandas as pd
 from loguru import logger
 
-
-def GetData(csv_file, column_name=None, show_flag=False):
-
+def csv_reader(csv_file, column_name=None, show_flag=False):
     """
-    GetData allows to read the data from a CSV file and converts them into a NumPy array.
+    csv_reader allows to read the data from a CSV file and converts them into a NumPy array.
     It can also show the entire dataset as a Pandas dataframe on terminal
     or show a single column of the data table.
-    the GetData function does not show the dataframe, unless specified by changing show_flag argument. 
+    The csv_reader function does not show the dataframe, unless specified by changing show_flag argument.
 
     :param csvfile: path to the CSV file
     :type csvfile: str
@@ -21,7 +19,7 @@ def GetData(csv_file, column_name=None, show_flag=False):
     :type show_flag: bool
     :return: the function returns a multidimensional numpy array if no column_name is passed as argument, otherwise it returns a unidimensional numpy array 
     :rtype: numpy.ndarray
-
+    
     """
     df = pd.read_csv(csv_file, delimiter=';')
     if column_name is None:
@@ -34,6 +32,29 @@ def GetData(csv_file, column_name=None, show_flag=False):
             print(df[column_name])
         return np.array(df[column_name].values)
 
+def get_data(filename, target_name, ex_cols = 0):
+    """
+    Load the feature matrix and the target vector from a CSV file through csv_reader.
+
+    :param filename: path of the CSV file the data are read from
+    :type filename: str
+    :param target_name: name of the CSV column holding the target values
+    :type target_name: str
+    :param ex_cols: optional, default 0. Start index of the column slice kept as features
+    :type ex_cols: int
+    :return: features (columns from index ex_cols onwards) and targets, in this order
+    :rtype: tuple(numpy.ndarray, numpy.ndarray)
+    """
+    logger.info(f'Reading data from file {os.path.basename(filename)}, with {target_name} as target column ')
+    table = csv_reader(filename)
+    features = table[:, ex_cols:]
+    targets = csv_reader(filename, target_name)
+    # A length mismatch is only reported in the log: both arrays are returned anyway
+    same_length = len(features) == len(targets)
+    if not same_length:
+        logger.error("Number of samples in features and targets do not match")
+    return features, targets
+
 
 def main():
     parser = argparse.ArgumentParser(description="CSV Reader - A tool to read CSV files with Pandas.")
@@ -46,12 +67,12 @@ def main():
 
     try:
         if args.command == "show":
-            GetData(args.filename, show_flag=True)
+            csv_reader(args.filename, show_flag=True)
         elif args.command == "show_column":
             if not args.column:
                 parser.error("The '--column' argument is required for 'show_column' command.")
             else:
-                GetData(args.filename, args.column, show_flag=True)
+                csv_reader(args.filename, args.column, show_flag=True)
     except FileNotFoundError as e:
         logger.error("File not found", e)
 

diff --git a/code/gaussian_reg.py b/code/gaussian_reg.py
@@ -8,13 +8,13 @@
 from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
 from sklearn.preprocessing import StandardScaler
 
-from abspath import AbsolutePath
-from csvreader import GetData
+from abspath import abs_path
+from csvreader import get_data
 
-def GaussRegression(filename, n_splits, ex_cols=0,  plot_flag=False):
+def gaussian_reg(filename, n_splits, ex_cols=0,  plot_flag=False):
     """
-    GaussRegression performs a gaussian regression with k-fold cross-validation
-    on the given dataset and prints evaluation metrics of the gaussian regression model.
+    gaussian_reg performs a Gaussian regression with k-fold cross-validation on the given dataset
+    and prints evaluation metrics of the gaussian regression model.
 
     :param filename: path to the CSV file containing the dataset 
     :type filename: str
@@ -29,8 +29,8 @@ def GaussRegression(filename, n_splits, ex_cols=0,  plot_flag=False):
     """
     # Loading data...
     #Importing features excluded first three columns: FILE_ID, AGE_AT_SCAN, SEX
-    x = GetData(filename)[:, ex_cols:] 
-    y = GetData(filename, "AGE_AT_SCAN")
+    # get_data returns (features, targets); the features keep columns from index ex_cols on
+    x, y = get_data(filename, "AGE_AT_SCAN", ex_cols)
 
     # Standardize features
     scaler = StandardScaler()
@@ -95,30 +95,42 @@ def GaussRegression(filename, n_splits, ex_cols=0,  plot_flag=False):
         # Show the plot
         plt.show()
 
-def main():
-    parser = argparse.ArgumentParser(description='Gaussian regression with k-fold cross-validation predicting the age of patients from magnetic resonance imaging')
-
-    parser.add_argument("filename", help="Name of the file that has to be analyzed")
-    parser.add_argument("--location", help="Location of the file, i.e. folder containing it")
-    parser.add_argument("--n_splits", type=int, default=5, help="Number of folds for k-folding cross-validation")
-    parser.add_argument("--ex_cols", type = int, default=3, help="Number of columns excluded when importing data")
-    parser.add_argument("--plot", action='store_true', help="Show the plot of actual vs predicted brain age")
+def gaussian_reg_parsing():
+    """
+    Parse the command-line arguments and run gaussian_reg on the selected CSV file.
+    """
+    parser = argparse.ArgumentParser(description=
+        'Gaussian regression predicting the age of patients from magnetic resonance imaging')
+
+    parser.add_argument("filename",
+                         help="Name of the file that has to be analized")
+    parser.add_argument("--target", default = "AGE_AT_SCAN",
+                        help="Name of the colums holding target values")
+    parser.add_argument("--location",
+                         help="Location of the file, i.e. folder containing it")
+    parser.add_argument("--folds", type = int, default = 5,
+                         help="Number of folds in the k-folding (>4, default 5)")
+    parser.add_argument("--ex_cols", type = int, default = 3,
+                         help="Number of columns excluded when importing (default 3)")
+    parser.add_argument("--plot", action="store_true",
+                         help="Show the plot of actual vs predicted brain age")
 
     args = parser.parse_args()
 
-    if args.n_splits > 4:
+    if args.folds > 4:
         try:
-            if not args.location:
-                GaussRegression(args.filename, n_splits=args.n_splits, ex_cols = args.ex_cols, plot_flag=args.plot)
-            else:
-                args.filename = AbsolutePath(args.filename, args.location)
-                GaussRegression(args.filename, n_splits=args.n_splits, ex_cols = args.ex_cols, plot_flag=args.plot)
+            if args.location:
+                args.filename = abs_path(args.filename, args.location)
+            logger.info(f"Opening file : {args.filename}")
+            # gaussian_reg loads the data itself from the file path and has no target
+            # parameter, so --target is accepted but not forwarded to it.
+            gaussian_reg(args.filename, n_splits=args.folds, ex_cols=args.ex_cols, plot_flag=args.plot)
         except FileNotFoundError:
             logger.error("File not found.")
-            return None
-    else: 
-        logger.error("Invalid number of folds: at least 5 folds required")
+    else:
+        logger.error("Invalid number of folds: at least 5 folds required.")
+
 
 if __name__ == "__main__":
-    main()
+    gaussian_reg_parsing()
 
diff --git a/code/linear_reg.py b/code/linear_reg.py
@@ -8,13 +8,13 @@
 from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
 from sklearn.preprocessing import StandardScaler
 
-from abspath import AbsolutePath
-from csvreader import GetData
+from abspath import abs_path
+from csvreader import get_data
 
-def LinRegression(filename, n_splits, ex_cols=0, plot_flag=False):
+def linear_reg(filename, n_splits, ex_cols=0, plot_flag=False):
 
     """
-    LinRegression performs linear regression with k-fold cross-validation on the
+    linear_reg performs linear regression with k-fold cross-validation on the
     given dataset and prints evaluation metrics of the linear regression model.
 
     :param filename: path to the CSV file containing the dataset 
@@ -30,8 +30,8 @@ def LinRegression(filename, n_splits, ex_cols=0, plot_flag=False):
     """
     # Loading data...
     #Importing features excluded first three columns: FILE_ID, AGE_AT_SCAN, SEX
-    x = GetData(filename)[:, ex_cols:]
-    y = GetData(filename, "AGE_AT_SCAN")
+    # get_data returns (features, targets); the features keep columns from index ex_cols on
+    x, y = get_data(filename, "AGE_AT_SCAN", ex_cols)
 
     # Standardize features
     scaler = StandardScaler()
@@ -95,29 +95,41 @@ def LinRegression(filename, n_splits, ex_cols=0, plot_flag=False):
         # Show the plot
         plt.show()
 
-def main():
-    parser = argparse.ArgumentParser(description='Linear regression with k-fold cross-validation predicting the age of patients from magnetic resonance imaging')
-
-    parser.add_argument("filename", help="Name of the file that has to be analyzed")
-    parser.add_argument("--location", help="Location of the file, i.e. folder containing it")
-    parser.add_argument("--ex_cols", type = int, default = 3, help="Number of columns excluded when importing data")
-    parser.add_argument("--n_splits", type=int, default=5, help="Number of folds for k-folding cross-validation")
-    parser.add_argument("--plot", action='store_true', help="Show the plot of actual vs predicted brain age")
+def linear_reg_parsing():
+    """
+    Parse the command-line arguments and run linear_reg on the selected CSV file.
+    """
+    parser = argparse.ArgumentParser(description=
+        'Linear regression predicting the age of patients from magnetic resonance imaging')
+
+    parser.add_argument("filename",
+                         help="Name of the file that has to be analized")
+    parser.add_argument("--target", default = "AGE_AT_SCAN",
+                        help="Name of the colums holding target values")
+    parser.add_argument("--location",
+                         help="Location of the file, i.e. folder containing it")
+    parser.add_argument("--folds", type = int, default = 5,
+                         help="Number of folds in the k-folding (>4, default 5)")
+    parser.add_argument("--ex_cols", type = int, default = 3,
+                         help="Number of columns excluded when importing (default 3)")
+    parser.add_argument("--plot", action="store_true",
+                         help="Show the plot of actual vs predicted brain age")
 
     args = parser.parse_args()
 
-    if args.n_splits > 4:
+    if args.folds > 4:
         try:
-            if not args.location:
-                LinRegression(args.filename, n_splits=args.n_splits, ex_cols = args.ex_cols, plot_flag=args.plot)
-            else:
-                args.filename = AbsolutePath(args.filename, args.location)
-                LinRegression(args.filename, n_splits=args.n_splits, ex_cols = args.ex_cols, plot_flag=args.plot)
+            if args.location:
+                args.filename = abs_path(args.filename, args.location)
+            logger.info(f"Opening file : {args.filename}")
+            # linear_reg loads the data itself from the file path and has no target
+            # parameter, so --target is accepted but not forwarded to it.
+            linear_reg(args.filename, n_splits=args.folds, ex_cols=args.ex_cols, plot_flag=args.plot)
         except FileNotFoundError:
             logger.error("File not found.")
-            return None
-    else: 
-        logger.error("Invalid number of folds: at least 5 folds required")
+    else:
+        logger.error("Invalid number of folds: at least 5 folds required.")
+
 
 if __name__ == "__main__":
-    main()
+    linear_reg_parsing()