From e3ece58d36db4c12f030c5ea0b8f7525477c1f3c Mon Sep 17 00:00:00 2001
From: Waldir Leoncio
Date: Wed, 31 Jul 2024 13:24:53 +0200
Subject: [PATCH 1/4] Added `verbose` to `grid_search()`

---
 src/pCRscore/discovery_svm.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/pCRscore/discovery_svm.py b/src/pCRscore/discovery_svm.py
index 60c835c..2ad53aa 100644
--- a/src/pCRscore/discovery_svm.py
+++ b/src/pCRscore/discovery_svm.py
@@ -51,8 +51,7 @@ def extract_features(data):
     return X, y
 
 
-def grid_search(X, y, n_cores = 1):
-    # TODO: profile (with cProfile?) to possibly add progress bar
+def grid_search(X, y, n_cores = 1, verbose = 0):
     # Defining the parameter range for the hyperparameter grid search
     param_grid = {
         'C': numpy.exp(numpy.linspace(-12, 3, num = 50)),
@@ -77,7 +76,8 @@
     # no verbosity
     grid = GridSearchCV(
         SVC(class_weight='balanced'),
-        param_grid, scoring = scoring, refit = 'F1', cv = 10, n_jobs = n_cores
+        param_grid, scoring = scoring, refit = 'F1', cv = 10, n_jobs = n_cores,
+        verbose = verbose
     )
 
     # Fit the model for grid search using the training data

From aec9b5aa57b23c8f0f48a499cf2e733eb96478e5 Mon Sep 17 00:00:00 2001
From: Waldir Leoncio
Date: Wed, 31 Jul 2024 13:25:19 +0200
Subject: [PATCH 2/4] Added `evaluate_model()` (#1)

---
 src/pCRscore/discovery_svm.py | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/src/pCRscore/discovery_svm.py b/src/pCRscore/discovery_svm.py
index 2ad53aa..1c1096f 100644
--- a/src/pCRscore/discovery_svm.py
+++ b/src/pCRscore/discovery_svm.py
@@ -2,8 +2,10 @@
 import numpy
 from sklearn import preprocessing
 from sklearn.metrics import make_scorer, f1_score, accuracy_score
-from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
+from sklearn.model_selection import \
+    GridSearchCV, train_test_split, StratifiedKFold, KFold, cross_val_score
 from sklearn.svm import SVC
+from sklearn.datasets import make_classification
 
 # TODO: add code from
 # https://github.com/YounessAzimzade/XML-TME-NAC-BC/blob/main/Discovery%20SVM.ipynb
@@ -84,3 +86,33 @@ def grid_search(X, y, n_cores = 1, verbose = 0):
     grid.fit(X_train, y_train)
 
     return grid
+
+def evaluate_model(X, y, verbose = False):
+    # We normally start with the model that has the best performance and
+    # fine-tune its parameters to find the best model.
+    # Here, the following model was found to have the best performance
+    # based on the combined score.
+
+    # Create model
+    model = SVC(
+        C = 1, gamma = 0.1, kernel = 'rbf', probability = True,
+        class_weight = 'balanced'
+    )
+
+    # It should be noted that SHAP values calculated using these two models are
+    # very similar, particularly for features with high correlation to response.
+
+    cv = KFold(n_splits=5, random_state=1, shuffle=True)
+
+    # evaluate model
+    Acc_score = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
+    f1_score = cross_val_score(model, X, y, scoring='f1', cv=cv, n_jobs=-1)
+    roc_auc = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
+
+    # report performance
+    if verbose:
+        print('Accuracy: %.3f (%.3f)' % (numpy.mean(Acc_score)*100, numpy.std(Acc_score)*100))
+        print('f1 score: %.3f (%.3f)' % (numpy.mean(f1_score), numpy.std(f1_score)))
+        print('AUC: %.3f (%.3f)' % (numpy.mean(roc_auc), numpy.std(roc_auc)))
+
+    return {'Accuracy': Acc_score, 'f1 score': f1_score, 'AUC': roc_auc}

From 28bd8541f55094c91ca0553d8afb9aadd7626b60 Mon Sep 17 00:00:00 2001
From: Waldir Leoncio
Date: Wed, 31 Jul 2024 14:31:31 +0200
Subject: [PATCH 3/4] Added TODO

---
 tests/test_discovery_svm.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_discovery_svm.py b/tests/test_discovery_svm.py
index 9ccd204..176784a 100644
--- a/tests/test_discovery_svm.py
+++ b/tests/test_discovery_svm.py
@@ -56,3 +56,5 @@ def test_preprocess(mock_read_csv, mock_data):
 
     X, y = discovery_svm.extract_features(data)
     assert X.shape == (100, 44)
+
+# TODO: add tests for SVM grid_search and evaluate_model

From 8784999b54d0ddf2398cbb65f620c18c79b4d8a3 Mon Sep 17 00:00:00 2001
From: Waldir Leoncio
Date: Wed, 31 Jul 2024 15:32:52 +0200
Subject: [PATCH 4/4] Added unit tests for SVM (#1)

---
 Makefile                    |  3 +++
 pytest.ini                  |  2 ++
 tests/test_discovery_svm.py | 18 +++++++++++++++++-
 3 files changed, 22 insertions(+), 1 deletion(-)
 create mode 100644 pytest.ini

diff --git a/Makefile b/Makefile
index a0ecbe0..27628ba 100644
--- a/Makefile
+++ b/Makefile
@@ -16,4 +16,7 @@ local-install:
 	$(ENV_PATH)pip install .
 
 test:
+	$(ENV_PATH)pytest -m "not slow"
+
+test-full:
 	$(ENV_PATH)pytest
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..caa99f0
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,2 @@
+[pytest]
+markers = slow: marks tests as slow (deselect with '-m "not slow"')
diff --git a/tests/test_discovery_svm.py b/tests/test_discovery_svm.py
index 176784a..2d9bf09 100644
--- a/tests/test_discovery_svm.py
+++ b/tests/test_discovery_svm.py
@@ -57,4 +57,20 @@ def test_preprocess(mock_read_csv, mock_data):
     X, y = discovery_svm.extract_features(data)
     assert X.shape == (100, 44)
 
-# TODO: add tests for SVM grid_search and evaluate_model
+@pytest.mark.slow
+def test_grid_search():
+    X = np.random.randn(100, 44)
+    y = np.random.choice([0, 1], 100)
+    grid = discovery_svm.grid_search(X, y)
+    assert isinstance(grid, discovery_svm.GridSearchCV)
+    assert hasattr(grid, 'best_params_')
+    assert hasattr(grid, 'best_score_')
+
+def test_evaluate_model():
+    X = np.random.randn(100, 44)
+    y = np.random.choice([0, 1], 100)
+    stats = discovery_svm.evaluate_model(X, y)
+    assert isinstance(stats, dict)
+    assert len(stats) == 3
+    for scores in stats.values():
+        assert len(scores) == 5
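
Usage sketch (reference only, not part of any patch above): the snippet below shows how
the functions touched by this series might be exercised on synthetic data. It assumes the
package is importable as `pCRscore` (matching the `src/pCRscore/` layout) and uses
`make_classification`, which patch 2 already imports in `discovery_svm.py`, in place of
the real discovery cohort.

    from pCRscore import discovery_svm
    from sklearn.datasets import make_classification

    # Synthetic stand-in for the 100 x 44 feature matrix used in the tests
    X, y = make_classification(n_samples=100, n_features=44, random_state=1)

    # Hyperparameter search; `verbose` is passed through to GridSearchCV (patch 1)
    grid = discovery_svm.grid_search(X, y, n_cores=1, verbose=1)
    print(grid.best_params_, grid.best_score_)

    # Five-fold cross-validated evaluation of the fixed SVC (patch 2);
    # returns a dict of per-fold accuracy, F1 and ROC AUC scores
    stats = discovery_svm.evaluate_model(X, y, verbose=True)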