From e3ece58d36db4c12f030c5ea0b8f7525477c1f3c Mon Sep 17 00:00:00 2001
From: Waldir Leoncio
Date: Wed, 31 Jul 2024 13:24:53 +0200
Subject: [PATCH 1/4] Added `verbose` to `grid_search()`

---
 src/pCRscore/discovery_svm.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/pCRscore/discovery_svm.py b/src/pCRscore/discovery_svm.py
index 60c835c..2ad53aa 100644
--- a/src/pCRscore/discovery_svm.py
+++ b/src/pCRscore/discovery_svm.py
@@ -51,8 +51,7 @@ def extract_features(data):
     return X, y
 
 
-def grid_search(X, y, n_cores = 1):
-    # TODO: profile (with cProfile?) to possibly add progress bar
+def grid_search(X, y, n_cores = 1, verbose = 0):
     # Defining the parameter range for the hyperparameter grid search
     param_grid = {
         'C': numpy.exp(numpy.linspace(-12, 3, num = 50)),
@@ -77,7 +76,8 @@
     # no verbosity
     grid = GridSearchCV(
         SVC(class_weight='balanced'),
-        param_grid, scoring = scoring, refit = 'F1', cv = 10, n_jobs = n_cores
+        param_grid, scoring = scoring, refit = 'F1', cv = 10, n_jobs = n_cores,
+        verbose = verbose
     )
 
     # Fit the model for grid search using the training data

From aec9b5aa57b23c8f0f48a499cf2e733eb96478e5 Mon Sep 17 00:00:00 2001
From: Waldir Leoncio
Date: Wed, 31 Jul 2024 13:25:19 +0200
Subject: [PATCH 2/4] Added `evaluate_model()` (#1)

---
 src/pCRscore/discovery_svm.py | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/src/pCRscore/discovery_svm.py b/src/pCRscore/discovery_svm.py
index 2ad53aa..1c1096f 100644
--- a/src/pCRscore/discovery_svm.py
+++ b/src/pCRscore/discovery_svm.py
@@ -2,8 +2,10 @@
 import numpy
 from sklearn import preprocessing
 from sklearn.metrics import make_scorer, f1_score, accuracy_score
-from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
+from sklearn.model_selection import \
+    GridSearchCV, train_test_split, StratifiedKFold, KFold, cross_val_score
 from sklearn.svm import SVC
+from sklearn.datasets import make_classification
 
 # TODO: add code from
 # https://github.com/YounessAzimzade/XML-TME-NAC-BC/blob/main/Discovery%20SVM.ipynb
@@ -84,3 +86,33 @@ def grid_search(X, y, n_cores = 1, verbose = 0):
     grid.fit(X_train, y_train)
 
     return grid
+
+def evaluate_model(X, y, verbose = False):
+    # We normally start with the model that has the best performance and
+    # fine-tune its parameters to find the best model.
+    # Here, the following model was found to have the best performance
+    # based on the combined score.
+
+    # Create model
+    model = SVC(
+        C = 1, gamma = 0.1, kernel = 'rbf', probability = True,
+        class_weight = 'balanced'
+    )
+
+    # It should be noted that SHAP values calculated using these two models are
+    # very similar, particularly for features with high correlation to response.
+
+    cv = KFold(n_splits=5, random_state=1, shuffle=True)
+
+    # evaluate model
+    Acc_score = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
+    f1_score = cross_val_score(model, X, y, scoring='f1', cv=cv, n_jobs=-1)
+    roc_auc = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
+
+    # report performance
+    if verbose:
+        print('Accuracy: %.3f (%.3f)' % (numpy.mean(Acc_score)*100, numpy.std(Acc_score)*100))
+        print('f1 score: %.3f (%.3f)' % (numpy.mean(f1_score), numpy.std(f1_score)))
+        print('AUC: %.3f (%.3f)' % (numpy.mean(roc_auc), numpy.std(roc_auc)))
+
+    return {'Accuracy': Acc_score, 'f1 score': f1_score, 'AUC': roc_auc}

From 28bd8541f55094c91ca0553d8afb9aadd7626b60 Mon Sep 17 00:00:00 2001
From: Waldir Leoncio
Date: Wed, 31 Jul 2024 14:31:31 +0200
Subject: [PATCH 3/4] Added TODO

---
 tests/test_discovery_svm.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_discovery_svm.py b/tests/test_discovery_svm.py
index 9ccd204..176784a 100644
--- a/tests/test_discovery_svm.py
+++ b/tests/test_discovery_svm.py
@@ -56,3 +56,5 @@ def test_preprocess(mock_read_csv, mock_data):
 
     X, y = discovery_svm.extract_features(data)
     assert X.shape == (100, 44)
+
+# TODO: add tests for SVM grid_search and evaluate_model

From 8784999b54d0ddf2398cbb65f620c18c79b4d8a3 Mon Sep 17 00:00:00 2001
From: Waldir Leoncio
Date: Wed, 31 Jul 2024 15:32:52 +0200
Subject: [PATCH 4/4] Added unit tests for SVM (#1)

---
 Makefile                    |  3 +++
 pytest.ini                  |  2 ++
 tests/test_discovery_svm.py | 18 +++++++++++++++++-
 3 files changed, 22 insertions(+), 1 deletion(-)
 create mode 100644 pytest.ini

diff --git a/Makefile b/Makefile
index a0ecbe0..27628ba 100644
--- a/Makefile
+++ b/Makefile
@@ -16,4 +16,7 @@ local-install:
 	$(ENV_PATH)pip install .
 
 test:
+	$(ENV_PATH)pytest -m "not slow"
+
+test-full:
 	$(ENV_PATH)pytest
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..caa99f0
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,2 @@
+[pytest]
+markers = slow: marks tests as slow (deselect with '-m "not slow"')
diff --git a/tests/test_discovery_svm.py b/tests/test_discovery_svm.py
index 176784a..2d9bf09 100644
--- a/tests/test_discovery_svm.py
+++ b/tests/test_discovery_svm.py
@@ -57,4 +57,20 @@ def test_preprocess(mock_read_csv, mock_data):
     X, y = discovery_svm.extract_features(data)
     assert X.shape == (100, 44)
 
-# TODO: add tests for SVM grid_search and evaluate_model
+@pytest.mark.slow
+def test_grid_search():
+    X = np.random.randn(100, 44)
+    y = np.random.choice([0, 1], 100)
+    grid = discovery_svm.grid_search(X, y)
+    assert isinstance(grid, discovery_svm.GridSearchCV)
+    assert hasattr(grid, 'best_params_')
+    assert hasattr(grid, 'best_score_')
+
+def test_evaluate_model():
+    X = np.random.randn(100, 44)
+    y = np.random.choice([0, 1], 100)
+    stats = discovery_svm.evaluate_model(X, y)
+    assert isinstance(stats, dict)
+    assert len(stats) == 3
+    for scores in stats.values():
+        assert len(scores) == 5
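
Usage sketch (reference only, not part of any patch above): the snippet below shows how
the functions touched by this series might be exercised on synthetic data. It assumes the
package is importable as `pCRscore` (matching the `src/pCRscore/` layout) and uses
`make_classification`, which patch 2 already imports in `discovery_svm.py`, in place of
the real discovery cohort.

    from pCRscore import discovery_svm
    from sklearn.datasets import make_classification

    # Synthetic stand-in for the 100 x 44 feature matrix used in the tests
    X, y = make_classification(n_samples=100, n_features=44, random_state=1)

    # Hyperparameter search; `verbose` is passed through to GridSearchCV (patch 1)
    grid = discovery_svm.grid_search(X, y, n_cores=1, verbose=1)
    print(grid.best_params_, grid.best_score_)

    # Five-fold cross-validated evaluation of the fixed SVC (patch 2);
    # returns a dict of per-fold accuracy, F1 and ROC AUC scores
    stats = discovery_svm.evaluate_model(X, y, verbose=True)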