refactor: extract split tests and graphs (merge #23)

- refactor: separate split tests - refactor: extract utils from `knn`, `lda`, `lr` - refactor: extract graphs and splits in svm and plsda
f-aguzzi · May 20, 2024 · 6865c88 · 6865c88
1 parent 0b13ed3
commit 6865c88
Show file tree

Hide file tree

Showing 15 changed files with 259 additions and 394 deletions.
diff --git a/chemfusekit/__init__.py b/chemfusekit/__init__.py
@@ -1,4 +1,4 @@
 '''__init__.py file for the library'''
 from beartype import BeartypeConf
 from beartype.claw import beartype_this_package
-beartype_this_package(conf=BeartypeConf(violation_type=TypeError))     
+beartype_this_package(conf=BeartypeConf(violation_type=TypeError))
diff --git a/chemfusekit/__utils.py b/chemfusekit/__utils.py
@@ -0,0 +1,115 @@
+'''Utilities module: functions that are shared between different classes'''
+from sklearn.cross_decomposition import PLSRegression as PLSR
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import confusion_matrix, classification_report
+
+import numpy as np
+
+import plotly.express as px
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+def graph_output(scores, model, name: str):
+    '''Print the scores and the model's explained variance ratio, then show 2D and 3D score plots.'''
+    print(scores)
+    print(f"""
+        explained variance ratio with {name}:
+        {model.explained_variance_ratio_}
+    """)
+
+    # Print the explained variance ratio once more, on a single line (same values as above)
+    print("Explained Variance Ratio:", model.explained_variance_ratio_)
+
+    # 2D scores plot: LV1 vs LV2, colored by the 'Substance' column of `scores`
+    fig = px.scatter(scores, x="LV1", y="LV2", color="Substance", hover_data=['Substance'])
+    fig.update_xaxes(zeroline=True, zerolinewidth=1, zerolinecolor='Black')
+    fig.update_yaxes(zeroline=True, zerolinewidth=1, zerolinecolor='Black')
+    fig.update_layout(
+        height=600,
+        width=800,
+        title_text='Scores Plot')
+    fig.show()
+
+    # 3D scores plot: LV1, LV2 and LV3, colored by 'Substance'; hover name is the row index
+    fig = px.scatter_3d(scores, x='LV1', y='LV2', z='LV3',
+                        color='Substance', hover_data=['Substance'],
+                        hover_name=scores.index
+    )
+    fig.update_layout(title_text=f"3D colored by Substance for {name}")
+    fig.show()
+
+def run_split_test(x, y, model, extended=False):
+    '''Refit `model` on a stratified 70/30 train/test split of x, y and plot the test-set confusion matrix.'''
+    x_train, x_test, y_train, y_test = train_test_split(
+        x,
+        y,
+        test_size=0.3,
+        random_state=42,
+        shuffle=True,
+        stratify=y
+    )
+
+    model.fit(x_train, y_train)
+    y_pred = model.predict(x_test)
+
+    # TODO: add something to print _x_scores multimodally
+
+    if extended:
+        # Print the class labels the fitted model exposes (requires a `classes_` attribute)
+        print(model.classes_)
+        # Print the intercept of the fitted model (requires an `intercept_` attribute)
+        print(model.intercept_)
+        # Print the coefficients of the fitted model - that can be easily interpreted
+        # (correlating or not with y)
+        print(model.coef_)
+
+        '''
+        Evaluate the model: each sample has a probability of belonging to Positive
+        or Negative outcome. Class 0 is Negative, class 1 is Positive.  If the value
+        of the first column (probability of being Negative) is higher than 0.5, we
+        have a Negative sample. Otherwise, it will be Positive
+        '''
+        probabilities = model.predict_proba(x_train)
+        print(probabilities)
+
+        # Accuracy on the training set. NOTE(review): the result is discarded - print or store it?
+        model.score(x_train, y_train)
+
+        predictions = model.predict(x_train)
+
+        print("Calibration predictions: ")
+        print(predictions)
+        print_confusion_matrix(y_train, predictions, "Confusion matrix based on training set")
+
+    if isinstance(model, PLSR):  # PLSR output is rounded, made non-negative and cast to int8
+        y_pred = np.int8(np.abs(np.around(y_pred, decimals=0)))
+
+    print_confusion_matrix(y_test, y_pred, "Confusion matrix based on evaluation set")
+
+# TODO: make multimodal
+def print_confusion_matrix(y1, y2, title):
+    '''Plot the confusion matrix of true labels y1 vs predicted labels y2 under `title`, then print the report.'''
+    cm = confusion_matrix(y1, y2)
+
+    # Tick labels must cover every class found in y1 or y2, to match the rows/columns of cm
+    class_labels = sorted(set(y1) | set(y2))
+
+    # Plot the confusion matrix using seaborn with custom colormap (Blues)
+    sns.heatmap(cm,
+        annot=True,
+        fmt='d',
+        cmap='Blues',
+        xticklabels=class_labels,
+        yticklabels=class_labels,
+        cbar=False,
+        vmin=0,
+        vmax=cm.max()
+    )
+
+    plt.xlabel('Predicted')
+    plt.ylabel('True')
+    plt.title(title)
+    plt.show()
+
+    # Print the per-class precision/recall/F1 report with two decimal digits
+    print(classification_report(y1, y2, digits=2))
diff --git a/chemfusekit/knn.py b/chemfusekit/knn.py
@@ -3,15 +3,11 @@
 from beartype.typing import Callable
 
 from sklearn.neighbors import KNeighborsClassifier
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import confusion_matrix, classification_report
 
 import pandas as pd
 
-import seaborn as sns
-import matplotlib.pyplot as plt
-
 from chemfusekit.lldf import LLDFModel
+from chemfusekit.__utils import run_split_test, print_confusion_matrix
 
 class KNNSettings:
     '''Holds the settings for the kNN object.'''
@@ -50,10 +46,6 @@ def __init__(
 class KNN:
     '''Class to store the data, methods and artifacts for k-Nearest Neighbors Analysis'''
     def __init__(self, settings: KNNSettings, fused_data: LLDFModel):
-        if type(settings) is not KNNSettings:
-            raise TypeError("Invalid settings: should be a KNNSettings-class object.")
-        if type(fused_data) is not LLDFModel:
-            raise TypeError("Invalid fused_data: shold be a LLDFModel-class object.")
         self.settings = settings
         self.fused_data = fused_data
         self.model: Optional[KNeighborsClassifier] = None
@@ -77,80 +69,21 @@ def knn(self):
             y_pred = knn.predict(self.fused_data.x_data)
             print(y_pred)
 
-            # Assuming 'y_true' and 'y_pred' are your true and predicted labels
-            cm = confusion_matrix(self.fused_data.y, y_pred)
-
-            # Get unique class labels from y_true
-            class_labels = sorted(set(self.fused_data.y))
-
-            # Plot the confusion matrix using seaborn with custom colormap (Blues)
-            sns.heatmap(
-                cm,
-                annot=True,
-                fmt='d',
-                cmap='Blues',
-                xticklabels=class_labels,
-                yticklabels=class_labels,
-                cbar=False, vmin=0,
-                vmax=cm.max()
-            )
-
-            plt.xlabel('Predicted')
-            plt.ylabel('True')
-            plt.title('Confusion Matrix based on the whole data set')
-            plt.show()
-
-            # Print the classification report
-            print(classification_report(self.fused_data.y, y_pred, digits=2))
-
-        if self.settings.test_split and self.settings.output:
-            # Split the data into a training set and a test set
-            x_train, x_test, y_train, y_test = train_test_split(
-                self.fused_data.x_data,
+            print_confusion_matrix(
                 self.fused_data.y,
-                test_size=0.3,
-                random_state=42
+                y_pred,
+                "Confusion Matrix based on the whole data set"
             )
 
-            # Train the kNN model on the training section of the dataset
-            knn = KNeighborsClassifier(
+        if self.settings.test_split and self.settings.output:
+            knn_split = KNeighborsClassifier(
                 n_neighbors=self.settings.n_neighbors,
                 metric=self.settings.metric,
                 weights=self.settings.weights,
                 algorithm=self.settings.algorithm
             )
-            knn.fit(x_train, y_train)
-
-            # View the prediction on the test data
-            y_pred = knn.predict(x_test)
-            print(y_pred)
-
-            # Assuming 'y_true' and 'y_pred' are your true and predicted labels
-            cm = confusion_matrix(y_test, y_pred)
-
-            # Get unique class labels from y_true
-            class_labels = sorted(set(y_test))
-
-            # Plot the confusion matrix using seaborn with custom colormap (Blues)
-            sns.heatmap(
-                cm,
-                annot=True,
-                fmt='d',
-                cmap='Blues',
-                xticklabels=class_labels,
-                yticklabels=class_labels,
-                cbar=False,
-                vmin=0,
-                vmax=cm.max()
-            )
-
-            plt.xlabel('Predicted')
-            plt.ylabel('True')
-            plt.title('Confusion Matrix based on evaluation set')
-            plt.show()
+            run_split_test(self.fused_data.x_data, self.fused_data.y, knn_split)
 
-            # Print the classification report
-            print(classification_report(y_test, y_pred, digits=2))
 
     def predict(self, x_data: pd.DataFrame):
         '''Performs kNN prediction once the model is trained.'''

diff --git a/chemfusekit/lda.py b/chemfusekit/lda.py
@@ -6,22 +6,23 @@
 
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LD
 from sklearn.model_selection import train_test_split
-
 from sklearn.metrics import confusion_matrix, classification_report
 
-import plotly.express as px
-import seaborn as sns
-import matplotlib.pyplot as plt
-
 from chemfusekit.lldf import LLDFModel
+from chemfusekit.__utils import graph_output, run_split_test
 
 class LDASettings:
     '''Holds the settings for the LDA object.'''
-    def __init__(self, components: int = 3, output: bool = False):
+    def __init__(self, components: int = 3, output: bool = False, test_split: bool = False):
         if components <= 2:
             raise ValueError("Invalid component number: must be a > 1 integer.")
+        if test_split is True and output is False:
+            raise Warning(
+                "You selected test_split but it won't run because you disabled the output."
+            )
         self.components = components
         self.output = output
+        self.test_split = test_split
 
 class LDA:
     '''Class to store the data, methods and artifacts for Linear Discriminant Analysis'''
@@ -56,81 +57,20 @@ def lda(self):
 
         scores = pd.concat([scores, y_dataframe], axis = 1)
 
-        if self.settings.output:
-            print(scores)
-
-            print(f"""
-                explained variance ratio (three components) with LDA:
-                {lda.explained_variance_ratio_}
-            """)
-
-            # Display the explained variance ratio
-            print("Explained Variance Ratio:", lda.explained_variance_ratio_)
-
-            #Scores plot
-            fig = px.scatter(scores, x="LV1", y="LV2", color="Substance", hover_data=['Substance'])
-            fig.update_xaxes(zeroline=True, zerolinewidth=1, zerolinecolor='Black')
-            fig.update_yaxes(zeroline=True, zerolinewidth=1, zerolinecolor='Black')
-            fig.update_layout(
-                height=600,
-                width=800,
-                title_text='Scores Plot')
-            fig.show()
-
-            # Plot 3D scores
-            fig = px.scatter_3d(scores, x='LV1', y='LV2', z='LV3',
-                                color='Substance', hover_data=['Substance'],
-                                hover_name=scores.index
-            )
-            fig.update_layout(
-            title_text='3D colored by Substance for Linear Discriminant Analysis')
-            fig.show()
-
-        lda2 = LD(n_components=self.settings.components)
-
-        self.x_train, x_test, y_train, y_test = train_test_split(
-            (scores.drop('Substance', axis=1).values),
-            self.y,
-            test_size=0.3,
-            random_state=42
-        )
-
-        lda2.fit(self.x_train, y_train)
-        lda2.predict(x_test)
-        y_pred = lda2.predict(x_test)
-
-        if self.settings.output:
-            self.__print_prediction_graphs(y_test, y_pred)
-
+        # Store the trained model
         self.model = lda
 
-    def __print_prediction_graphs(self, y_test, y_pred):
-        '''Helper function to print graphs and stats about LDA predictions.'''
-        # Assuming 'y_test' and 'y_pred' are your true and predicted labels
-        cm = confusion_matrix(y_test, y_pred)
-
-        # Get unique class labels from y_true
-        class_labels = sorted(set(y_test))
-
-        # Plot the confusion matrix using seaborn with custom colormap (Blues)
-        sns.heatmap(cm,
-            annot=True,
-            fmt='d',
-            cmap='Blues',
-            xticklabels=class_labels,
-            yticklabels=class_labels,
-            cbar=False,
-            vmin=0,
-            vmax=cm.max()
-        )
-
-        plt.xlabel('Predicted')
-        plt.ylabel('True')
-        plt.title('Confusion Matrix based on evaluation set')
-        plt.show()
-
-        # Print the classification report
-        print(classification_report(y_test, y_pred, digits=2))
+        # Show graphs if required by the user
+        if self.settings.output:
+            graph_output(scores, self.model, "Linear Discriminant Analysis")
+
+            # Run split tests if required by the user
+            if self.settings.test_split:
+                run_split_test(
+                    (scores.drop('Substance', axis=1).values),
+                    self.y,
+                    LD(n_components=self.settings.components)
+                )
 
     def predict(self, x_data: pd.DataFrame):
         '''Performs LDA prediction once the model is trained.'''