Skip to content

Commit

Permalink
Merge branch 'issue-1' into develop
Browse files (browse the repository at this point in the history)
  • Loading branch information
wleoncio committed Aug 22, 2024
2 parents f9f2bec + fb268ae commit fb9fbfb
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 22 deletions.
2 changes: 1 addition & 1 deletion src/pCRscore/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,4 @@


# Load modules
from . import discovery_svm
from . import svm
Empty file added src/pCRscore/common.py
Empty file.
29 changes: 19 additions & 10 deletions src/pCRscore/discovery_svm.py → src/pCRscore/svm.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from sklearn.svm import SVC
from sklearn.datasets import make_classification

def preprocess(data):
def preprocess(data, svm_type = "discovery"):
# Mapping the values in the 'Response' column to binary values 0 and 1
resp = {'pCR': 1, 'RD': 0}
data.Response = [resp[item] for item in data.Response]
Expand All @@ -22,10 +22,19 @@ def preprocess(data):
data = pandas.get_dummies(data, columns=categorical_cols)

# Selecting validation cohort data
valid_cohort = [
'E-MTAB-4439', 'GSE18728', 'GSE19697', 'GSE20194', 'GSE20271',
'GSE22093', 'GSE22358', 'GSE42822', 'GSE22513'
]
if svm_type == "discovery":
valid_cohort = [
'E-MTAB-4439', 'GSE18728', 'GSE19697', 'GSE20194', 'GSE20271',
'GSE22093', 'GSE22358', 'GSE42822', 'GSE22513'
]
elif svm_type == "validation":
valid_cohort = [
'GSE25066', 'GSE32603', 'GSE32646', 'GSE37946', 'GSE50948',
'GSE23988'
]
else:
raise ValueError("Invalid SVM type. Choose 'discovery' or 'validation'")

data = data[data['Trial'].isin(valid_cohort)]

return data
Expand All @@ -37,7 +46,7 @@ def extract_features(data):
X = data.drop(dropped_columns, axis = 1)
d3 = data.drop(dropped_columns, axis = 1)

# P# Extract the target variable 'y' (dependent variable)
# Extract the target variable 'y' (dependent variable)
y = data['Response']

# Standardize the features using the StandardScaler from sklearn
Expand All @@ -51,7 +60,7 @@ def extract_features(data):

return X, y

def grid_search(X, y, n_cores = 1, verbose = 0):
def grid_search(X, y, n_cores = -2, verbose = 0):
# Defining the parameter range for the hyperparameter grid search
param_grid = {
'C': numpy.exp(numpy.linspace(-12, 3, num = 50)),
Expand Down Expand Up @@ -100,9 +109,9 @@ def evaluate_model(X, y, verbose = False):
cv = KFold(n_splits=5, random_state=1, shuffle=True)

# evaluate model
Acc_score = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
f1_score = cross_val_score(model, X, y, scoring='f1', cv=cv, n_jobs=-1)
roc_auc = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
Acc_score = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-2)
f1_score = cross_val_score(model, X, y, scoring='f1', cv=cv, n_jobs=-2)
roc_auc = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-2)

# report performance
if verbose:
Expand Down
26 changes: 15 additions & 11 deletions tests/test_discovery_svm.py → tests/test_svm.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from pCRscore import discovery_svm
from pCRscore import svm
import pandas as pd
from unittest import mock
import pytest
Expand Down Expand Up @@ -50,26 +50,30 @@ def test_preprocess(mock_read_csv, mock_data):
# Configure the mock to return your predefined DataFrame
mock_read_csv.return_value = mock_data

data = pd.read_csv("Data NAC cohort _1_.csv") # returns mock data instead
data = discovery_svm.preprocess(data)
assert data.shape == (100, 48)
data_disc = pd.read_csv("Data NAC cohort _1_.csv") # returns mock data instead
data_valid = data_disc.copy()
data_disc = svm.preprocess(data_disc)
data_valid['Trial'] = 'GSE25066'
data_valid = svm.preprocess(data_valid, svm_type = "validation")

X, y = discovery_svm.extract_features(data)
assert X.shape == (100, 44)
for dt in [data_disc, data_valid]:
assert dt.shape == (100, 48)
X, y = svm.extract_features(dt)
assert X.shape == (100, 44)

@pytest.mark.slow
def test_grid_search():
X = pd.DataFrame(np.random.randn(100, 44))
y = np.random.choice([0, 1], 100)
grid = discovery_svm.grid_search(X, y)
assert isinstance(grid, discovery_svm.GridSearchCV)
grid = svm.grid_search(X, y, n_cores = -2)
assert isinstance(grid, svm.GridSearchCV)
assert hasattr(grid, 'best_params_')
assert hasattr(grid, 'best_score_')

def test_evaluate_model():
X = np.random.randn(100, 44)
y = np.random.choice([0, 1], 100)
stats = discovery_svm.evaluate_model(X, y)
stats = svm.evaluate_model(X, y)
assert isinstance(stats, dict)
assert len(stats) == 3
for i in stats:
Expand All @@ -78,7 +82,7 @@ def test_evaluate_model():
def test_shapley():
X = pd.DataFrame(np.random.randn(100, 44))
y = np.random.choice([0, 1], 100)
shapl = discovery_svm.shap_analysis(X, y)
shapl = svm.shap_analysis(X, y)
assert isinstance(shapl, np.ndarray)
assert shapl.shape == (100, 44)
discovery_svm.shap_plot(shapl, X)
svm.shap_plot(shapl, X)

0 comments on commit fb9fbfb

Please sign in to comment.