feat!: added base class for classifiers, models, settings

f-aguzzi · Jun 4, 2024 · 4af5d47 · 4af5d47
1 parent fd9a243
commit 4af5d47
Show file tree

Hide file tree

Showing 74 changed files with 1,514 additions and 161 deletions.
diff --git a/chemfusekit/__base.py b/chemfusekit/__base.py
@@ -0,0 +1,43 @@
+'''Base classes for data models, settings and classifiers.'''
+
+import pandas as pd
+import numpy as np
+import joblib
+from .__utils import GraphMode
+
+
+class BaseDataModel:
+    '''Holds the x_data, x_train and y artifacts of a data-outputting operation.'''
+    def __init__(self, x_data: pd.DataFrame, x_train: pd.DataFrame, y: np.ndarray):
+        # Store the three artifacts exactly as received, without copying.
+        self.x_data, self.x_train = x_data, x_train
+        self.y = y
+
+
+class BaseSettings:
+    '''Base settings: the graph output mode and the test_split flag.'''
+    def __init__(self, output: GraphMode = GraphMode.NONE, test_split: bool = False):
+        split_without_output = test_split is True and output is GraphMode.NONE
+        if split_without_output:
+            raise Warning(
+                "You selected test_split but it won't run because you disabled the output."
+            )
+        self.output, self.test_split = output, test_split
+
+
+class BaseClassifier:
+    '''Parent class for all classifiers, containing basic shared utilities.'''
+    def __init__(self, settings: BaseSettings, data: BaseDataModel):
+        self.settings, self.data = settings, data
+        self.model = None  # stays None until a model is trained or imported
+
+    def import_model(self, import_path: str):
+        '''Loads a joblib-serialized model from import_path into self.model.'''
+        self.model = joblib.load(import_path)  # load() takes the path, returns the object
+
+    def export_model(self, export_path: str):
+        '''Dumps self.model to export_path; raises RuntimeError if no model is set.'''
+        if self.model is None:
+            raise RuntimeError("You haven't trained the model yet! You cannot export it now.")
+        joblib.dump(self.model, export_path)
+
diff --git a/chemfusekit/knn.py b/chemfusekit/knn.py
@@ -6,20 +6,18 @@
 
 import pandas as pd
 
-from chemfusekit.lldf import LLDFModel
+from chemfusekit.lldf import LLDFDataModel
 from chemfusekit.__utils import run_split_test, print_confusion_matrix, print_table, GraphMode
+from .__base import BaseSettings, BaseClassifier
 
-class KNNSettings:
+
+class KNNSettings(BaseSettings):
     '''Holds the settings for the kNN object.'''
-    def __init__(
-            self,
-            n_neighbors: int = 15,
-            metric: str | Callable = 'euclidean',
-            weights: str | Callable = 'uniform',
-            algorithm: str = 'auto',
-            output: GraphMode = GraphMode.NONE,
-            test_split: bool = False
-        ):
+    def __init__(self, n_neighbors: int = 15, metric: str | Callable = 'euclidean', weights: str | Callable = 'uniform',
+                 algorithm: str = 'auto', output: GraphMode = GraphMode.NONE, test_split: bool = False):
+
+        super().__init__(output, test_split)
+
         if n_neighbors < 1:
             raise ValueError("Invalid n_neighbors number: should be a positive integer.")
         if metric not in ['minkwoski', 'precomputed', 'euclidean'] and not callable(metric):
@@ -32,23 +30,16 @@ def __init__(
             raise  ValueError(
                 "Invalid algorithm: should be 'auto', 'ball_tree', 'kd_tree' or 'brute'."
             )
-        if test_split is True and output is GraphMode.NONE:
-            raise Warning(
-                "You selected test_split but it won't run because you disabled the output."
-            )
         self.n_neighbors = n_neighbors
         self.metric = metric
         self.weights = weights
         self.algorithm = algorithm
-        self.output = output
-        self.test_split = test_split
 
-class KNN:
+
+class KNN(BaseClassifier):
     '''Class to store the data, methods and artifacts for k-Nearest Neighbors Analysis'''
-    def __init__(self, settings: KNNSettings, fused_data: LLDFModel):
-        self.settings = settings
-        self.fused_data = fused_data
-        self.model: Optional[KNeighborsClassifier] = None
+    def __init__(self, settings: KNNSettings, fused_data: LLDFDataModel):
+        super().__init__(settings, fused_data)
 
     def knn(self):
         '''Performs k-Nearest Neighbors Analysis'''
@@ -59,13 +50,13 @@ def knn(self):
             weights=self.settings.weights,
             algorithm=self.settings.algorithm
         )
-        knn.fit(self.fused_data.x_data, self.fused_data.y)
+        knn.fit(self.data.x_data, self.data.y)
 
         # Save the trained model
         self.model = knn
 
         # View the prediction on the test data
-        y_pred = knn.predict(self.fused_data.x_data)
+        y_pred = knn.predict(self.data.x_data)
         print_table(
             ["Predictions"],
             y_pred.reshape(1,len(y_pred)),
@@ -74,7 +65,7 @@ def knn(self):
         )
 
         print_confusion_matrix(
-            self.fused_data.y,
+            self.data.y,
             y_pred,
             "Confusion Matrix based on the whole data set",
             self.settings.output
@@ -87,8 +78,7 @@ def knn(self):
                 weights=self.settings.weights,
                 algorithm=self.settings.algorithm
             )
-            run_split_test(self.fused_data.x_data, self.fused_data.y, knn_split)
-
+            run_split_test(self.data.x_data, self.data.y, knn_split)
 
     def predict(self, x_data: pd.DataFrame):
         '''Performs kNN prediction once the model is trained.'''

diff --git a/chemfusekit/lda.py b/chemfusekit/lda.py
@@ -6,32 +6,29 @@
 
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LD
 
-from chemfusekit.lldf import LLDFModel
+from chemfusekit.lldf import LLDFDataModel
 from chemfusekit.__utils import graph_output, run_split_test
 from chemfusekit.__utils import print_confusion_matrix, print_table, GraphMode
+from .__base import BaseDataModel, BaseClassifier, BaseSettings
 
-class LDASettings:
+
+class LDASettings(BaseSettings):
     '''Holds the settings for the LDA object.'''
-    def __init__(self, components: int = 3, output: GraphMode = GraphMode.NONE,
-                 test_split: bool = False):
+    def __init__(self, components: int = 3, output: GraphMode = GraphMode.NONE, test_split: bool = False):
+        super().__init__(output, test_split)
         if components <= 2:
             raise ValueError("Invalid component number: must be a > 1 integer.")
-        if test_split is True and output is GraphMode.NONE:
-            raise Warning(
-                "You selected test_split but it won't run because you disabled the output."
-            )
         self.components = components
-        self.output = output
-        self.test_split = test_split
 
-class LDA:
+
+class LDA(BaseClassifier):
     '''Class to store the data, methods and artifacts for Linear Discriminant Analysis'''
-    def __init__(self, lldf_model: LLDFModel, settings: LDASettings):
+    def __init__(self, settings: LDASettings, data_model: BaseDataModel):
+        super().__init__(settings, data_model)
         self.settings = settings
-        self.x_data = lldf_model.x_data
-        self.x_train = lldf_model.x_train
-        self.y = lldf_model.y
-        self.model: Optional[LD] = None
+        self.x_data = data_model.x_data
+        self.x_train = data_model.x_train
+        self.y = data_model.y
 
     def lda(self):
         '''Performs Linear Discriminant Analysis'''

diff --git a/chemfusekit/lldf.py b/chemfusekit/lldf.py
@@ -8,7 +8,7 @@
 import matplotlib
 import matplotlib.pyplot as plt
 
-from .__utils import GraphMode
+from .__base import GraphMode, BaseDataModel
 
 
 class Table:
@@ -19,12 +19,10 @@ def __init__(self, file_path: str, sheet_name: str, preprocessing: str):
         self.preprocessing = preprocessing
 
 
-class LLDFModel:
+class LLDFDataModel(BaseDataModel):
     '''Models the output data from the LLDF operation'''
     def __init__(self, x_data: pd.DataFrame, x_train: pd.DataFrame, y: np.ndarray):
-        self.x_data = x_data
-        self.x_train = x_train
-        self.y = y
+        super().__init__(x_data, x_train, y)
 
 
 class LLDFSettings:
@@ -40,8 +38,8 @@ def _snv(input_data: np.ndarray):
     for i in range(input_data.shape[0]):
 
         # Apply correction
-        output_data[i,:] = (
-            (input_data[i,:] - np.mean(input_data[i,:])) / np.std(input_data[i,:])
+        output_data[i, :] = (
+            (input_data[i, :] - np.mean(input_data[i,:])) / np.std(input_data[i, :])
         )
 
     return output_data
@@ -52,7 +50,7 @@ class LLDF:
     def __init__(self, tables: List[Table], settings: LLDFSettings):
         self.settings = settings
         self.tables = tables
-        self.fused_data: Optional[LLDFModel] = None
+        self.fused_data: Optional[LLDFDataModel] = None
 
     def lldf(self):
         '''Performs low-level data fusion'''
@@ -139,7 +137,7 @@ def lldf(self):
             axis=1
         )
 
-        self.fused_data = LLDFModel(x_data, x_train, y)
+        self.fused_data = LLDFDataModel(x_data, x_train, y)
 
     def export_data(self, export_path: str):
         '''Exports the data fusion artifacts to a file'''

diff --git a/chemfusekit/lr.py b/chemfusekit/lr.py
@@ -7,11 +7,12 @@
 from sklearn.linear_model import LogisticRegression
 
 from chemfusekit.__utils import run_split_test, print_confusion_matrix, print_table, GraphMode
+from .__base import BaseSettings
 
-class LRSettings:
+class LRSettings(BaseSettings):
     '''Holds the settings for the LR object.'''
-    def __init__(self, algorithm: str = 'liblinear', output: GraphMode = GraphMode.NONE,
-                 test_split: bool = False):
+    def __init__(self, algorithm: str = 'liblinear', output: GraphMode = GraphMode.NONE, test_split: bool = False):
+        super().__init__(output, test_split)
         if algorithm not in [
             'lbfgs',
             'liblinear',
@@ -21,13 +22,7 @@ def __init__(self, algorithm: str = 'liblinear', output: GraphMode = GraphMode.N
             'saga'
         ]:
             raise ValueError(f"{algorithm}: this algorithm does not exist.")
-        if test_split is True and output is GraphMode.NONE:
-            raise Warning(
-                "You selected test_split but it won't run because you disabled the output."
-            )
         self.algorithm = algorithm
-        self.output = output
-        self.test_split = test_split
 
 
 class LR:

diff --git a/chemfusekit/pca.py b/chemfusekit/pca.py
@@ -10,8 +10,9 @@
 
 import scipy.stats
 
-from chemfusekit.lldf import LLDFModel
 from chemfusekit.__utils import print_table, GraphMode
+from .__base import BaseDataModel
+
 
 class PCASettings:
     '''Holds the settings for the PCA object.'''
@@ -32,17 +33,18 @@ def __init__(self, target_variance: float = 0.95,
 
 class PCA:
     '''A class to store the data, methods and artifacts for Principal Component Analysis'''
-    def __init__(self, fused_data: LLDFModel, settings: PCASettings):
-        self.fused_data = fused_data
+    def __init__(self, settings: PCASettings, data: BaseDataModel):
+        self.data = data
         self.components = 0
         self.pca_model: Optional[PC] = None
         self.settings = settings
         self.array_scores: Optional[np.ndarray] = None
 
     def pca(self):
         '''Performs Principal Component Analysis.'''
-        # Read from the data fusion object
-        x_data = self.fused_data.x_data
+
+        # Read from the data fusion object
+        x_data = self.data.x_data
 
         # Run PCA producing the reduced variable Xreg and select the first 10 components
         pca = PC(self.settings.initial_components)
@@ -100,11 +102,10 @@ def pca(self):
         self.pca_model = pca
         self.pca_model.fit_transform(x_data)
 
-
     def pca_stats(self):
         '''Produces PCA-related statistics.'''
-        x_data = self.fused_data.x_data
-        x_train = self.fused_data.x_train
+        x_data = self.data.x_data
+        x_train = self.data.x_train
 
         # Prepare the Scores dataframe (and concatenate the original 'Region' variable)
         pc_cols = [f"PC{i+1}" for i in range(self.components)]
@@ -246,7 +247,6 @@ def mean_confidence_interval(data, confidence=self.settings.confidence_level):
             )
             fig_normalized.show()
 
-
         # Assuming 'scores' is your DataFrame with the 'class' column
         # Drop the 'class' column before converting to NumPy array
         array_scores = scores.drop('Substance', axis=1).values