template MLOps

LaErre9 · Sep 3, 2021 · 7ac8a22 · 7ac8a22 · github-actions · Sep 3, 2021
1 parent 4c6096d
commit 7ac8a22
Show file tree

Hide file tree

Showing 21 changed files with 5,743 additions and 5,625 deletions.
diff --git a/.github/workflows/cml.yaml b/.github/workflows/cml.yaml
@@ -16,10 +16,10 @@ jobs:
           python myTraining.py
 
           echo "## Model score" > report.md
-          cat score.txt >> report.md
+          cat results/score_monitoring.txt >> report.md
 
           echo "## Data visual" >> report.md
-          cml-publish report.png --md >> report.md
+          cml-publish results/report_threshold.png --md >> report.md
           
           cml-send-comment report.md
 

diff --git a/data.csv → data/data.csv b/data.csv → data/data.csv
diff --git a/data/data_preparation/data_preparation.py b/data/data_preparation/data_preparation.py
@@ -0,0 +1,107 @@
+# Libraries used for data analysis
+import pandas as pd
+import numpy as np
+import matplotlib
+import seaborn as sns
+import matplotlib.pyplot as plt
+from numpy.random.mtrand import seed
+
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import GradientBoostingRegressor
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.naive_bayes import GaussianNB
+from sklearn.metrics import mean_squared_error, r2_score
+from sklearn import tree
+
+# Plot style configuration.
+# The seaborn theme is applied first; the matplotlib rcParams assignments
+# that follow then set the figure size and font settings explicitly.
+sns.set(context='notebook', style='darkgrid', palette='colorblind', font='sans-serif', font_scale=1, rc=None)
+matplotlib.rcParams['figure.figsize'] =[8,8]
+matplotlib.rcParams.update({'font.size': 15})
+matplotlib.rcParams['font.family'] = 'sans-serif'
+
+# --- Data preparation ---
+# Dataset loading and train/test split
+
+def data_split(data, ratio):
+    """Split ``data`` into a shuffled train subset and test subset.
+
+    Row positions are permuted with a fixed seed (42), so the same input
+    always produces the same split.
+
+    Args:
+        data: Object supporting ``len()`` and positional ``.iloc`` indexing
+            (the script passes a pandas DataFrame).
+        ratio: Fraction of rows assigned to the test subset; the test size
+            is ``int(len(data) * ratio)``.
+
+    Returns:
+        Tuple ``(train, test)`` selected from ``data`` via ``.iloc``.
+    """
+    # A private RandomState seeded with 42 yields the same permutation as
+    # np.random.seed(42) + np.random.permutation(...), but does not reseed
+    # the global NumPy random state as a side effect.
+    rng = np.random.RandomState(42)
+    shuffled = rng.permutation(len(data))
+    test_set_size = int(len(data) * ratio)
+    test_indices = shuffled[:test_set_size]
+    train_indices = shuffled[test_set_size:]
+    return data.iloc[train_indices], data.iloc[test_indices]
+
+if __name__ == "__main__":
+    # Feature column names, copied verbatim from the CSV header. The trailing
+    # spaces in 'Fatigue ' and 'Gastrointestinal ' are part of the names.
+    feature_columns = [
+        'Breathing Problem', 'Fever', 'Dry Cough', 'Sore throat',
+        'Running Nose', 'Asthma', 'Chronic Lung Disease', 'Headache',
+        'Heart Disease', 'Diabetes', 'Hyper Tension', 'Fatigue ',
+        'Gastrointestinal ', 'Abroad travel', 'Contact with COVID Patient',
+        'Attended Large Gathering', 'Visited Public Exposed Places',
+        'Family working in Public Exposed Places', 'Wearing Masks',
+        'Sanitization from Market',
+    ]
+    target_column = 'COVID-19'
+
+    # Read the data and hold out 20% of the rows as the test set.
+    covid = pd.read_csv('data/data.csv')
+    train, test = data_split(covid, 0.2)
+
+    X_train = train[feature_columns].to_numpy()
+    X_test = test[feature_columns].to_numpy()
+
+    # Selecting a single column yields a 1-D array directly, so the row
+    # counts no longer need to be hard-coded in a reshape call.
+    Y_train = train[target_column].to_numpy()
+    Y_test = test[target_column].to_numpy()
+
+    # The reporting steps below live inside the __main__ guard because they
+    # depend on `covid`; at module level they would fail on import.
+
+    # Write the DataFrame summary (index dtype, columns, non-null counts and
+    # memory usage) to a text file.
+    with open('data/data_preparation/info.txt', 'w') as f:
+        covid.info(buf=f)
+
+    # Check for missing data: absolute count and percentage per column.
+    missing_values = covid.isnull().sum()
+    percent_missing = missing_values / covid.shape[0] * 100
+    value = {
+        'missing_values ': missing_values,
+        'percent_missing %': percent_missing,
+    }
+    frame = pd.DataFrame(value)
+    # Saved as CSV to keep the report easy to read.
+    frame.to_csv('data/data_preparation/missing_value.csv')
+
+    # Generate descriptive statistics and save them as CSV.
+    covid.describe().to_csv("data/data_preparation/dataset_statics.csv")
+
+# --- Data visualization ---
+# COVID-19
+# sns_plot = sns.countplot(x='COVID-19', data=covid)
+# figure = sns_plot.get_figure()
+# figure.savefig('data/data_preparation/data_viz/COVID-19.png', dpi = 400)
+
+# Breathing Problem 
+# sns_breathing = sns.countplot(x='Breathing Problem',hue='COVID-19',data=covid)
+# figure1 = sns_breathing.get_figure()
+# figure1.savefig('data/data_preparation/data_viz/BreathingProblem.png', dpi = 400)
+
+# Fever 
+# sns_fever = sns.countplot(x='Fever', hue='COVID-19', data=covid)
+# figure2 = sns_fever.get_figure()
+# figure2.savefig('data/data_preparation/data_viz/Fever.png', dpi = 400)
+
+# Dry Cough
+# sns_dry = sns.countplot(x='Dry Cough',hue='COVID-19',data=covid)
+# figure3 = sns_dry.get_figure()
+# figure3.savefig('data/data_preparation/data_viz/dry.png', dpi = 400)
+
+# Sore Throat
+# sns_sore = sns.countplot(x='Sore throat',hue='COVID-19',data=covid)
+# figure4 = sns_sore.get_figure()
+# figure4.savefig('data/data_preparation/data_viz/sore.png', dpi = 400)
+
+
+
+
+
diff --git a/data/data_preparation/data_viz/BreathingProblem.png b/data/data_preparation/data_viz/BreathingProblem.png
diff --git a/data/data_preparation/data_viz/COVID-19.png b/data/data_preparation/data_viz/COVID-19.png
diff --git a/data/data_preparation/data_viz/Fever.png b/data/data_preparation/data_viz/Fever.png
diff --git a/data/data_preparation/data_viz/dry.png b/data/data_preparation/data_viz/dry.png
diff --git a/data/data_preparation/data_viz/sore.png b/data/data_preparation/data_viz/sore.png
diff --git a/dataset_statics.csv → data/data_preparation/dataset_statics.csv b/dataset_statics.csv → data/data_preparation/dataset_statics.csv
@@ -1,9 +1,9 @@
-,Breathing Problem,Fever,Dry Cough,Sore throat,Running Nose,Asthma,Chronic Lung Disease,Headache,Heart Disease,Diabetes,Hyper Tension,Fatigue ,Gastrointestinal ,Abroad travel,Contact with COVID Patient,Attended Large Gathering,Visited Public Exposed Places,Family working in Public Exposed Places,Wearing Masks,Sanitization from Market,COVID-19
-count,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0
-mean,0.6661759293338241,0.7863452337136547,0.7926021347073978,0.7274567537725433,0.5432462274567538,0.4626426205373574,0.47202797202797203,0.5034965034965035,0.4642988590357011,0.4762605815237394,0.49006256900993744,0.5191387559808612,0.4694516010305484,0.45104895104895104,0.5016562384983437,0.4619065145380935,0.5189547294810453,0.41626794258373206,0.0,0.0,0.8065881486934119
-std,0.4716211327739574,0.4099235665965471,0.40548026751388566,0.44530878904756294,0.49817209342489355,0.4986483574853146,0.4992628934027898,0.5000337861645077,0.4987696953792399,0.4994820831364206,0.4999472415983911,0.49967955300634165,0.4991118498366407,0.49764381725600054,0.5000432695898289,0.4985926537423099,0.4996865689546946,0.49298444924353374,0.0,0.0,0.39500939378839356
-min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-25%,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
-50%,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
-75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
-max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
+,Breathing Problem,Fever,Dry Cough,Sore throat,Running Nose,Asthma,Chronic Lung Disease,Headache,Heart Disease,Diabetes,Hyper Tension,Fatigue ,Gastrointestinal ,Abroad travel,Contact with COVID Patient,Attended Large Gathering,Visited Public Exposed Places,Family working in Public Exposed Places,Wearing Masks,Sanitization from Market,COVID-19
+count,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0
+mean,0.6661759293338241,0.7863452337136547,0.7926021347073978,0.7274567537725433,0.5432462274567538,0.4626426205373574,0.47202797202797203,0.5034965034965035,0.4642988590357011,0.4762605815237394,0.49006256900993744,0.5191387559808612,0.4694516010305484,0.45104895104895104,0.5016562384983437,0.4619065145380935,0.5189547294810453,0.41626794258373206,0.0,0.0,0.8065881486934119
+std,0.4716211327739574,0.4099235665965471,0.40548026751388566,0.44530878904756294,0.49817209342489355,0.4986483574853146,0.4992628934027898,0.5000337861645077,0.4987696953792399,0.4994820831364206,0.4999472415983911,0.49967955300634165,0.4991118498366407,0.49764381725600054,0.5000432695898289,0.4985926537423099,0.4996865689546946,0.49298444924353374,0.0,0.0,0.39500939378839356
+min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
+25%,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
+50%,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
+75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
+max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
diff --git a/data/data_preparation/info.txt b/data/data_preparation/info.txt
@@ -0,0 +1,28 @@
+<class 'pandas.core.frame.DataFrame'>
+RangeIndex: 5434 entries, 0 to 5433
+Data columns (total 21 columns):
+ #   Column                                   Non-Null Count  Dtype
+---  ------                                   --------------  -----
+ 0   Breathing Problem                        5434 non-null   int64
+ 1   Fever                                    5434 non-null   int64
+ 2   Dry Cough                                5434 non-null   int64
+ 3   Sore throat                              5434 non-null   int64
+ 4   Running Nose                             5434 non-null   int64
+ 5   Asthma                                   5434 non-null   int64
+ 6   Chronic Lung Disease                     5434 non-null   int64
+ 7   Headache                                 5434 non-null   int64
+ 8   Heart Disease                            5434 non-null   int64
+ 9   Diabetes                                 5434 non-null   int64
+ 10  Hyper Tension                            5434 non-null   int64
+ 11  Fatigue                                  5434 non-null   int64
+ 12  Gastrointestinal                         5434 non-null   int64
+ 13  Abroad travel                            5434 non-null   int64
+ 14  Contact with COVID Patient               5434 non-null   int64
+ 15  Attended Large Gathering                 5434 non-null   int64
+ 16  Visited Public Exposed Places            5434 non-null   int64
+ 17  Family working in Public Exposed Places  5434 non-null   int64
+ 18  Wearing Masks                            5434 non-null   int64
+ 19  Sanitization from Market                 5434 non-null   int64
+ 20  COVID-19                                 5434 non-null   int64
+dtypes: int64(21)
+memory usage: 891.6 KB
diff --git a/Missingvalue.csv → data/data_preparation/missing_value.csv b/Missingvalue.csv → data/data_preparation/missing_value.csv
@@ -1,22 +1,22 @@
-,missing_values ,percent_missing %
-Breathing Problem,0,0.0
-Fever,0,0.0
-Dry Cough,0,0.0
-Sore throat,0,0.0
-Running Nose,0,0.0
-Asthma,0,0.0
-Chronic Lung Disease,0,0.0
-Headache,0,0.0
-Heart Disease,0,0.0
-Diabetes,0,0.0
-Hyper Tension,0,0.0
-Fatigue ,0,0.0
-Gastrointestinal ,0,0.0
-Abroad travel,0,0.0
-Contact with COVID Patient,0,0.0
-Attended Large Gathering,0,0.0
-Visited Public Exposed Places,0,0.0
-Family working in Public Exposed Places,0,0.0
-Wearing Masks,0,0.0
-Sanitization from Market,0,0.0
-COVID-19,0,0.0
+,missing_values ,percent_missing %
+Breathing Problem,0,0.0
+Fever,0,0.0
+Dry Cough,0,0.0
+Sore throat,0,0.0
+Running Nose,0,0.0
+Asthma,0,0.0
+Chronic Lung Disease,0,0.0
+Headache,0,0.0
+Heart Disease,0,0.0
+Diabetes,0,0.0
+Hyper Tension,0,0.0
+Fatigue ,0,0.0
+Gastrointestinal ,0,0.0
+Abroad travel,0,0.0
+Contact with COVID Patient,0,0.0
+Attended Large Gathering,0,0.0
+Visited Public Exposed Places,0,0.0
+Family working in Public Exposed Places,0,0.0
+Wearing Masks,0,0.0
+Sanitization from Market,0,0.0
+COVID-19,0,0.0
diff --git a/dvc.lock b/dvc.lock
@@ -3,13 +3,13 @@ stages:
   train:
     cmd: python myTraining.py
     deps:
-    - path: data.csv
+    - path: data/data.csv
       md5: 02b3a4aa918c639fb02b6d3cd98914c4
       size: 234007
     - path: myTraining.py
       md5: d8452f7c93722709dd9bf2069c4ae3ee
       size: 4770
     outs:
-    - path: metrics.json
+    - path: results/metrics.json
       md5: bb7d10954900192c72c9dd340b3dc570
       size: 131
diff --git a/dvc.yaml b/dvc.yaml
@@ -3,7 +3,7 @@ stages:
     cmd: python myTraining.py
     deps:
     - myTraining.py
-    - data.csv
+    - data/data.csv
     metrics:
-    - metrics.json:
+    - results/metrics.json:
         cache: false
Path	Metric	Old	New	Change
results/metrics.json	accuracy	0.97238	—	—
results/metrics.json	precision	0.97547	—	—
results/metrics.json	sensitivity	0.99094	—	—
results/metrics.json	specificity	0.89163	—	—