Skip to content

Commit

Permalink
Merge branch 'issue-1' into develop
Browse files Browse the repository at this point in the history
* issue-1:
  Added `combine_discovery_validation()` tests (#1)
  Added `combine_discovery_validation()` (#1)
  Added `*valid*` datasets (#1)
  Renamed more datasets in test
  Deflaked file
  Renamed data on test
  Added test to `fit_line()` (#1)
  Added `fit_line()` (#1)
  • Loading branch information
wleoncio committed Jan 14, 2025
2 parents 80a848d + 2ff0e2d commit 5228d86
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 16 deletions.
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ description = "Predicts pathological Complete Response scores"
readme = "README.md"
requires-python = ">= 3.8"
dependencies = [
"pandas", "numpy", "seaborn", "matplotlib", "scikit-learn", "shap"
"pandas", "numpy", "seaborn", "matplotlib", "scikit-learn", "shap",
"statsmodels"
]
classifiers = [
"Programming Language :: Python :: 3",
Expand Down
34 changes: 34 additions & 0 deletions src/pCRscore/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pandas
import statsmodels.api


# Function to drop non-float columns
Expand Down Expand Up @@ -26,3 +27,36 @@ def combine_fractions_shap(data_norm, shap):
all_pat_1.columns = ['Feature', 'Fraction', 'SHAP value']
all_pat_1_clean = all_pat_1.dropna()
return all_pat_1_clean


# Function to fit a line to SHAP vs Fraction for each cell type
def fit_line(data):
    """Fit an OLS line of SHAP value against Fraction for each cell type.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format table with columns 'Feature', 'Fraction' and
        'SHAP value' (as produced by ``combine_fractions_shap``).

    Returns
    -------
    pandas.DataFrame
        One row per feature with columns 'Feature', 'Coef' (the fitted
        slope) and 'CI' (the product of the 99.9% confidence-interval
        bounds of the slope; positive iff the interval excludes zero).
    """
    rows = []
    for feature, subset in data.groupby('Feature'):
        # Design matrix: intercept + Fraction.
        design = statsmodels.api.add_constant(subset['Fraction'])
        fitted = statsmodels.api.OLS(subset['SHAP value'], design).fit()
        slope = fitted.params['Fraction']
        # alpha=0.001 gives a 99.9% CI; the bounds share a sign exactly
        # when the interval does not contain zero.
        bounds = fitted.conf_int(alpha=0.001).loc['Fraction']
        rows.append(
            {'Feature': feature, 'Coef': slope, 'CI': bounds[0] * bounds[1]}
        )
    return pandas.DataFrame(rows)


def combine_discovery_validation(data_disc, data_valid, fit_disc, fit_valid):
    """Pool discovery and validation data, keeping only validated features.

    A feature is kept when the fitted slopes agree in sign across the two
    cohorts and, in each cohort, the slope's confidence interval excludes
    zero (encoded as a positive 'CI' product of the interval bounds).

    Parameters
    ----------
    data_disc, data_valid : pandas.DataFrame
        Long-format tables with a 'Feature' column (plus 'Fraction' and
        'SHAP value').
    fit_disc, fit_valid : pandas.DataFrame
        Per-feature fit results from ``fit_line`` ('Feature', 'Coef', 'CI').

    Returns
    -------
    pandas.DataFrame
        Rows of both cohorts restricted to the validated features.
    """
    # Align the two fit tables on feature name; suffixes mark the cohort.
    merged = pandas.merge(
        fit_valid, fit_disc, on="Feature", suffixes=("_Valid", "_Discv")
    )
    # Clear association: same slope sign in both cohorts and a CI that
    # excludes zero in each cohort.
    same_sign = merged['Coef_Discv'] * merged['Coef_Valid'] > 0
    ci_excludes_zero = (merged['CI_Valid'] > 0) & (merged['CI_Discv'] > 0)
    validated = merged[same_sign & ci_excludes_zero]

    # Stack both cohorts, then keep only the validated features.
    pooled = pandas.concat([data_disc, data_valid], axis=0)
    return pooled[pooled['Feature'].isin(validated['Feature'])]
66 changes: 51 additions & 15 deletions tests/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,47 +50,83 @@ def mock_data(shap=False):
if os.path.exists(".meta"):
# Running locally
local = True
data_raw = pandas.read_csv(".meta/DiscoveryData.csv")
shap_raw = pandas.read_csv(".meta/DiscoverySHAP.csv")
disc_data_raw = pandas.read_csv(".meta/DiscoveryData.csv")
disc_shap_raw = pandas.read_csv(".meta/DiscoverySHAP.csv")
valid_data_raw = pandas.read_csv(".meta/ValidationData.csv")
valid_shap_raw = pandas.read_csv(".meta/ValidationSHAP.csv")
else:
# On GitHub Actions
local = False
data_raw = mock_data()
shap_raw = mock_data(shap=True)
disc_data_raw = mock_data()
disc_shap_raw = mock_data(shap=True)


data = pipeline.drop_non_float(data_raw)
shap = pipeline.drop_non_float(shap_raw, extra_cols=range(36, 42))
disc_data = pipeline.drop_non_float(disc_data_raw)
disc_shap = pipeline.drop_non_float(disc_shap_raw, extra_cols=range(36, 42))
valid_data = pipeline.drop_non_float(valid_data_raw)
valid_shap = pipeline.drop_non_float(valid_shap_raw, extra_cols=range(36, 42))


def test_drop_non_float_and_unnamed():
assert data.shape == data_raw.iloc[:, 3:39].shape
assert shap.shape == shap_raw.iloc[:, 1:37].shape
assert disc_data.shape == disc_data_raw.iloc[:, 3:39].shape
assert disc_shap.shape == disc_shap_raw.iloc[:, 1:37].shape


# Normalize data
data_norm = pipeline.normalize_data(data.copy())
disc_data_norm = pipeline.normalize_data(disc_data.copy())
valid_data_norm = pipeline.normalize_data(valid_data.copy())


# Test
def test_normalize_data():
assert data_norm.shape == data.shape
assert disc_data_norm.shape == disc_data.shape
if local:
tol = 1e-3
assert math.isclose(
data['B.cells.Memory'].iloc[0], 0.01789, rel_tol=tol
disc_data['B.cells.Memory'].iloc[0], 0.01789, rel_tol=tol
)
assert math.isclose(
data_norm['B.cells.Memory'].iloc[0], 0.34186, rel_tol=tol
disc_data_norm['B.cells.Memory'].iloc[0], 0.34186, rel_tol=tol
)
assert math.isclose(
shap['Endothelials'].iloc[0], 0.037792, rel_tol=tol
disc_shap['Endothelials'].iloc[0], 0.037792, rel_tol=tol
)


# Combine data and SHAP values
data_shap = pipeline.combine_fractions_shap(data_norm, shap)
disc_data_shap = pipeline.combine_fractions_shap(disc_data_norm, disc_shap)
valid_data_shap = pipeline.combine_fractions_shap(valid_data_norm, valid_shap)


def test_combine_fractions_shape():
assert data_shap.shape == (35928, 3)
assert disc_data_shap.shape == (35928, 3)


# Fit lines for Discovery and Validation cohorts
fit_disc = pipeline.fit_line(disc_data_shap)
fit_valid = pipeline.fit_line(valid_data_shap)


def test_fit_line():
    """Check fit_line output shape, schema and (locally) known coefficients."""
    # One fitted row per distinct feature.
    assert fit_disc.shape[0] == len(disc_data_shap['Feature'].unique())
    for column in ('Feature', 'Coef', 'CI'):
        assert column in fit_disc.columns
    if local:
        # Reference values only reproducible with the real .meta datasets.
        mask = fit_disc['Feature'] == 'B.cells.Memory'
        coef = fit_disc.loc[mask, 'Coef'].values[0]
        ci = fit_disc.loc[mask, 'CI'].values[0]
        assert math.isclose(coef, -0.0198, rel_tol=1e-2)
        assert math.isclose(ci, 0.0000193, rel_tol=1e-2)

all_pat = pipeline.combine_discovery_validation(
disc_data_shap, valid_data_shap, fit_disc, fit_valid
)
def test_combine_discovery_validation():
    """Check the pooled, validated table has the expected schema and size."""
    assert 'Feature' in all_pat.columns
    assert 'Fraction' in all_pat.columns
    assert 'SHAP value' in all_pat.columns
    if local:
        # BUG FIX: the original line was a bare comparison whose result was
        # discarded, so the shape was never actually checked.
        assert all_pat.shape == (15888, 3)

0 comments on commit 5228d86

Please sign in to comment.