Skip to content

Commit

Permalink
Update environment, fixing script 3,4
Browse files (browse the repository at this point in the history)
  • Loading branch information
Piloxita committed Dec 8, 2024
1 parent 5253767 commit 8d24b52
Show file tree
Hide file tree
Showing 16 changed files with 190 additions and 152 deletions.
190 changes: 99 additions & 91 deletions conda-linux-64.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
services:
# run the jupyter notebook server inside the container
jupyter-notebook:
image: piloxita/dsci-522-2425-team35-heart_disease_diagnostic_machine:be7f85b
image: piloxita/dsci-522-2425-team35-heart_disease_diagnostic_machine:latest
ports:
- "9999:8888"
volumes:
Expand Down
5 changes: 5 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ dependencies:
- pandera=0.21.0
- six=1.16.0
- ucimlrepo=0.0.7
- altair=5.1.2
- click=8.1.7
- tabulate=0.9.0
- quarto=1.5.57
- pickle
- pip:
- deepchecks==0.18.1
- altair-ally==0.1.1
Expand Down
6 changes: 3 additions & 3 deletions heart_diagnostic_analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -3808,9 +3808,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:522-group-proj-2]",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "conda-env-522-group-proj-2-py"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -3822,7 +3822,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
"version": "3.11.11"
}
},
"nbformat": 4,
Expand Down
Binary file added pairwise_relationships.png
(Binary image file; the preview could not be displayed.)
Binary file added results/figures/categorical_distributions.png
(Binary image file; the preview could not be displayed.)
Binary file added results/figures/confusion_matrix.png
(Binary image file; the preview could not be displayed.)
Binary file added results/figures/correlation_matrix.png
(Binary image file; the preview could not be displayed.)
Binary file added results/figures/numeric_distributions.png
(Binary image file; the preview could not be displayed.)
Binary file added results/models/disease_pipeline.pickle
Binary file not shown.
11 changes: 11 additions & 0 deletions results/tables/cross_val_score.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
dummy,logreg,svc,logreg_bal,svc_bal
0.001,0.009,0.007,0.009,0.008
0.003,0.006,0.006,0.005,0.006
0.746,0.797,0.818,0.731,0.747
0.746,0.85,0.896,0.822,0.916
0.0,0.638,0.824,0.477,0.524
0.0,0.789,0.969,0.609,0.779
0.0,0.44,0.38,0.68,0.62
0.0,0.56,0.61,0.84,0.94
0.0,0.517,0.512,0.555,0.563
0.0,0.655,0.747,0.706,0.851
11 changes: 11 additions & 0 deletions results/tables/cross_val_std.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
dummy,logreg,svc,logreg_bal,svc_bal
0.0,0.001,0.0,0.0,0.001
0.001,0.001,0.0,0.0,0.0
0.004,0.063,0.036,0.035,0.099
0.001,0.012,0.015,0.02,0.01
0.0,0.159,0.182,0.056,0.154
0.0,0.032,0.018,0.031,0.034
0.0,0.167,0.084,0.179,0.11
0.0,0.038,0.065,0.038,0.022
0.0,0.167,0.094,0.09,0.13
0.0,0.033,0.047,0.031,0.014
4 changes: 4 additions & 0 deletions results/tables/model_metrics.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Metric,Train,Test
F1 Score,0.627,0.333
Recall,0.74,0.4
Accuracy,0.777,0.636
98 changes: 48 additions & 50 deletions scripts/3_eda.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,104 +2,102 @@
import os
import pandas as pd
import altair as alt
import aly
import altair_ally as aly

@click.command()
@click.option(
'--train_path',
'--train',
default='data/processed/train_df.csv',
type=click.Path(exists=True),
help='Path to the input training CSV file.'
)
@click.option(
'--test_path',
default='data/processed/test_df.csv',
type=click.Path(exists=True),
help='Path to the input testing CSV file.'
)
@click.option(
'--output_dir',
default='results/figures/',
'--write-to',
default='results',
type=click.Path(),
help='Directory where output figures will be saved.'
)
def main(train_path, test_path, output_dir):
def main(train, write_to):
# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)
os.makedirs(os.path.join(write_to, "figures"), exist_ok=True)

# Load data
train_df = pd.read_csv(train_path, index_col=0)
test_df = pd.read_csv(test_path, index_col=0)
train_df = pd.read_csv(train)

# EDA Steps:
print(train_df.info())

# Clean column names
train_df.columns = train_df.columns.str.strip().str.lower().str.replace(" ", "_", regex=False)
test_df.columns = test_df.columns.str.strip().str.lower().str.replace(" ", "_", regex=False)

# Altair configurations
alt.data_transformers.enable('default', max_rows=None)
aly.alt.data_transformers.enable('vegafusion')

# Univariate distribution for quantitative variables
numeric_columns = [
'age_(in_years)',
'resting_blood_pressure_(in_mm_hg_on_admission_to_the_hospital)',
'serum_cholesterol_(in_mg/dl)',
'maximum_heart_rate_achieved',
'st_depression_induced_by_exercise_relative_to_rest',
'number_of_major_vessels_(0–3)_colored_by_fluoroscopy'
'Age (in years)',
'Resting blood pressure (in mm Hg on admission to the hospital)',
'Serum cholesterol (in mg/dl)',
'Maximum heart rate achieved',
'ST depression induced by exercise relative to rest',
'Number of major vessels (0–3) colored by fluoroscopy'
]

# Visualization for numeric columns
numeric_dist_plot = aly.dist(
train_df[numeric_columns + ['diagnosis_of_heart_disease']],
color='diagnosis_of_heart_disease'
train_df[numeric_columns + ['Diagnosis of heart disease']],
color='Diagnosis of heart disease'
)
numeric_dist_plot.save(f"{output_dir}/numeric_distributions.png")
numeric_dist_plot.save(os.path.join(write_to, "figures", "numeric_distributions.png"))

categorical_columns = [
'sex',
'chest_pain_type',
'fasting_blood_sugar_>_120_mg/dl',
'resting_electrocardiographic_results',
'exercise-induced_angina',
'slope_of_the_peak_exercise_st_segment',
'thalassemia'
'Sex',
'Chest pain type',
'Fasting blood sugar > 120 mg/dl',
'Resting electrocardiographic results',
'Exercise-induced angina',
'Slope of the peak exercise ST segment',
'Thalassemia'
]

# Visualize categorical variables
categorical_dist_plot = aly.dist(
train_df[categorical_columns + ['diagnosis_of_heart_disease']]
.assign(diagnosis_of_heart_disease=lambda x: x['diagnosis_of_heart_disease'].astype(object)),
train_df[categorical_columns + ['Diagnosis of heart disease']]
.assign(diagnosis_of_heart_disease=lambda x: x['Diagnosis of heart disease'].astype(object)),
dtype='object',
color='diagnosis_of_heart_disease'
color='Diagnosis of heart disease'
)
categorical_dist_plot.save(f"{output_dir}/categorical_distributions.png")
categorical_dist_plot.save(os.path.join(write_to, "figures", "categorical_distributions.png"))

# Pairwise correlations for the numeric variables
correlation_plot = aly.corr(train_df[numeric_columns])
correlation_plot.save(f"{output_dir}/correlation_matrix.png")
correlation_plot.save(os.path.join(write_to, "figures", "correlation_matrix.png"))

# Select numeric columns with at least one high correlation
columns_with_at_least_one_high_corr = [
'age_(in_years)',
'resting_blood_pressure_(in_mm_hg_on_admission_to_the_hospital)',
'serum_cholesterol_(in_mg/dl)',
'maximum_heart_rate_achieved',
'st_depression_induced_by_exercise_relative_to_rest',
'number_of_major_vessels_(0–3)_colored_by_fluoroscopy',
'diagnosis_of_heart_disease'
'Age (in years)',
'Resting blood pressure (in mm Hg on admission to the hospital)',
'Serum cholesterol (in mg/dl)',
'Maximum heart rate achieved',
'ST depression induced by exercise relative to rest',
'Number of major vessels (0–3) colored by fluoroscopy',
'Diagnosis of heart disease'
]

sample_size = min(len(train_df), 300)

pairwise_relationships_plot = aly.pair(
train_df[columns_with_at_least_one_high_corr].sample(sample_size),
color='diagnosis_of_heart_disease'
# Pairplot-like visualization
pairwise_plot = alt.Chart(train_df).mark_point().encode(
x=alt.X(alt.repeat("column"), type='quantitative'),
y=alt.Y(alt.repeat("row"), type='quantitative'),
color='Diagnosis of heart disease'
).properties(
width=150,
height=150
).repeat(
row=['Age', 'Resting BP', 'Cholesterol'],
column=['Age', 'Resting BP', 'Cholesterol']
)
pairwise_relationships_plot.save(f"{output_dir}/pairwise_relationships.png")

pairwise_plot.save('pairwise_relationships.png')


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion scripts/4_training_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def main(train, write_to):
train_data = pd.read_csv(train)

# Split data into features and labels
X_train, y_train = train_data.drop(columns='target'), train_data['target']
X_train, y_train = train_data.drop(columns='Diagnosis of heart disease'), train_data['Diagnosis of heart disease']

# 1. DATA PREPROCESSOR
categorical_features = [
Expand Down
13 changes: 7 additions & 6 deletions scripts/5_evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ def main(train, test, write_to):
test_data = pd.read_csv(test)

# Split data into features and labels
X_train, y_train = train_data.drop(columns='target'), train_data['target']
X_test, y_test = test_data.drop(columns='target'), test_data['target']
X_train, y_train = train_data.drop(columns='Diagnosis of heart disease'), train_data['Diagnosis of heart disease']
X_test, y_test = test_data.drop(columns='Diagnosis of heart disease'), test_data['Diagnosis of heart disease']

# Load the saved best model
with open(model_path, 'rb') as f:
Expand All @@ -58,16 +58,17 @@ def main(train, test, write_to):
metrics_df = pd.DataFrame({
'Metric': ['F1 Score', 'Recall', 'Accuracy'],
'Train': [
f1_score(y_train, train_predictions, pos_label='> 50% diameter narrowing'),
recall_score(y_train, train_predictions, pos_label='> 50% diameter narrowing'),
f1_score(y_train, train_predictions, average='binary', pos_label='> 50% diameter narrowing'),
recall_score(y_train, train_predictions, average='binary', pos_label='> 50% diameter narrowing'),
best_model.score(X_train, y_train),
],
'Test': [
f1_score(y_test, test_predictions, pos_label='> 50% diameter narrowing'),
recall_score(y_test, test_predictions, pos_label='> 50% diameter narrowing'),
f1_score(y_test, test_predictions, average='binary', pos_label='> 50% diameter narrowing'),
recall_score(y_test, test_predictions, average='binary', pos_label='> 50% diameter narrowing'),
best_model.score(X_test, y_test),
],
})
metrics_df = metrics_df.round(3)
metrics_df.to_csv(os.path.join(write_to, "tables", "model_metrics.csv"), index=False)

# Save confusion matrix
Expand Down

0 comments on commit 8d24b52

Please sign in to comment.