Merge branch 'svm' into develop

* svm: Added installation instructions; Minor cleanup; squash! Added multithreading to SVM; `grid_search()` returns grid instead of void; Added multithreading to SVM; Fixed indentation; Added original code for `grid_search()` + deps; Minor comment fixes
ocbe-uio · Jun 28, 2024 · 6969b04 · 6969b04
2 parents c4d7a40 + a84587f
commit 6969b04
Show file tree

Hide file tree

Showing 2 changed files with 53 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -2,6 +2,14 @@
 
 Python package for predicting pathological Complete Response (pCR) scores according to [Azimzade et al. (2023)](https://www.biorxiv.org/content/10.1101/2023.09.07.556655). The original source code is available at https://github.com/YounessAzimzade/XML-TME-NAC-BC.
 
+# Installation
+
+This package is under development and is not yet available on PyPI. You can nonetheless install the development version directly from GitHub by running this command in your terminal:
+
+```bash
+pip install git+https://github.com/ocbe-uio/pCRscore.git
+```
+
 # References
 
 Explainable Machine Learning Reveals the Role of the Breast Tumor Microenvironment in Neoadjuvant Chemotherapy Outcome

diff --git a/src/pCRscore/discovery_svm.py b/src/pCRscore/discovery_svm.py
@@ -1,11 +1,14 @@
 import pandas
+import numpy
 from sklearn import preprocessing
+from sklearn.metrics import make_scorer, f1_score, accuracy_score
+from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
+from sklearn.svm import SVC
 
-def preprocess(data):
-    # TODO: add code from
-    # https://github.com/YounessAzimzade/XML-TME-NAC-BC/blob/main/Discovery%20SVM.ipynb
-    # section "Preprocessing"
+# TODO: add code from
+# https://github.com/YounessAzimzade/XML-TME-NAC-BC/blob/main/Discovery%20SVM.ipynb
 
+def preprocess(data):
     # Mapping the values in the 'Response' column to binary values 0 and 1
     resp = {'pCR': 1, 'RD': 0}
     data.Response = [resp[item] for item in data.Response]
@@ -34,15 +37,50 @@ def extract_features(data):
     X = pandas.DataFrame(data.drop(dropped_columns, axis = 1))
     d3 = pandas.DataFrame(data.drop(dropped_columns, axis = 1))
 
-    # P# Extract the target variable 'y' (dependent variable) from the 'data' DataFrame
+    # Extract the target variable 'y' (dependent variable)
     y = data['Response']
 
-    # Perform standardization on the features using the StandardScaler from sklearn
+    # Standardize the features using the StandardScaler from sklearn
     # This step scales the features to have mean 0 and standard deviation 1
-    # This is important for some machine learning algorithms that are sensitive to feature scales
+    # This is important for some machine learning algorithms that
+    # are sensitive to feature scales
     X = pandas.DataFrame(
         preprocessing.StandardScaler().fit(X).transform(X),
         columns = d3.columns
     )
 
     return X, y
+
+def grid_search(X, y, n_cores = 1):
+    # Run a cross-validated hyperparameter grid search for an SVM classifier.
+    #
+    # Parameters:
+    #   X: feature matrix (presumably the standardized features returned by
+    #      extract_features() -- TODO confirm against callers)
+    #   y: class labels for the rows of X
+    #   n_cores: forwarded to GridSearchCV as `n_jobs`
+    #
+    # Returns:
+    #   The GridSearchCV object after it has been fitted on the training
+    #   portion of the data, with refit based on the 'F1' score.
+    #
+    # TODO: profile (with cProfile?) to possibly add progress bar
+    # Defining the parameter range for the hyperparameter grid search:
+    # 50 values of C between e^-12 and e^3 and 50 values of gamma between
+    # e^-12 and e^1 (evenly spaced in the exponent), crossed with 4 kernels,
+    # i.e. 50 * 50 * 4 = 10,000 candidate combinations
+    param_grid = {
+        'C': numpy.exp(numpy.linspace(-12, 3, num = 50)),
+        'gamma': numpy.exp(numpy.linspace(-12, 1, num = 50)),
+        'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
+    }

+    # Define a custom scoring dictionary that includes F1 score and accuracy
+    scoring = {
+        'F1': make_scorer(f1_score),
+        'Accuracy': make_scorer(accuracy_score)
+    }

+    # Create a StratifiedKFold object with 5 splits for cross-validation
+    # NOTE(review): `kfold` is never used below; GridSearchCV is given
+    # cv = 10 instead. Confirm which cross-validation scheme is intended.
+    kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 0)

+    # Split the dataset into training and testing sets (1/3 held out for
+    # testing). No random_state is passed, so the split presumably varies
+    # between calls unless the global NumPy seed is fixed -- TODO confirm.
+    # NOTE(review): X_test and y_test are neither used nor returned here.
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1 / 3)

+    # Create a GridSearchCV object with the SVC classifier (balanced class
+    # weights), parameter grid, custom scoring, refit based on F1 score and
+    # 10-fold cross-validation; verbosity is left at its default. The search
+    # is parallelized over `n_cores` jobs.
+    grid = GridSearchCV(
+        SVC(class_weight='balanced'),
+        param_grid, scoring = scoring, refit = 'F1', cv = 10, n_jobs = n_cores
+    )

+    # Fit the model for grid search using the training data only
+    grid.fit(X_train, y_train)

+    return grid