Merge pull request #36 from f-aguzzi/pre/beta

Pre/beta into main: release 2.0.0
f-aguzzi · Jun 4, 2024 · 6896b35 · 6896b35
2 parents 862ddf6 + 330ad59
commit 6896b35
Show file tree

Hide file tree

Showing 127 changed files with 2,298 additions and 295 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,64 @@
+## [2.0.0-beta.4](https://github.com/f-aguzzi/tesi/compare/v2.0.0-beta.3...v2.0.0-beta.4) (2024-06-04)
+
+
+### Bug Fixes
+
+* lda and lr_tests missing arguments ([280159d](https://github.com/f-aguzzi/tesi/commit/280159d8208f46a2a843e9eeae60d82114e15261))
+
+
+### Docs
+
+* new blog post ([afed9f7](https://github.com/f-aguzzi/tesi/commit/afed9f7620d06559892a517daef4f78192d3f3e2))
+
+## [2.0.0-beta.3](https://github.com/f-aguzzi/tesi/compare/v2.0.0-beta.2...v2.0.0-beta.3) (2024-06-04)
+
+
+### Features
+
+* **LDA:** autodetect components from PCADataModel ([a59cd54](https://github.com/f-aguzzi/tesi/commit/a59cd545e9926de94117f2a46be5801c24271ba8))
+
+
+### Docs
+
+* fix wrong version number ([a0eb4e6](https://github.com/f-aguzzi/tesi/commit/a0eb4e6110dc25a5a8a4e6e72ff7ba02c05f6a14))
+* update examples ([c919596](https://github.com/f-aguzzi/tesi/commit/c919596c94a7fd0c54548027d05c857c758054c9))
+
+
+### Refactor
+
+* moved prediction into base class ([57a3497](https://github.com/f-aguzzi/tesi/commit/57a349743964db553aa6cea425631022c37920b3))
+* **lldf:** switch arguments in constructor call ([fcf7471](https://github.com/f-aguzzi/tesi/commit/fcf7471ba519ceed7747f48895421e932506b835))
+
+## [2.0.0-beta.2](https://github.com/f-aguzzi/tesi/compare/v2.0.0-beta.1...v2.0.0-beta.2) (2024-06-04)
+
+
+### Features
+
+* made LR inherit from BaseClassifier ([d06a7db](https://github.com/f-aguzzi/tesi/commit/d06a7db270a99517a6445c94bdfacc1901e90121))
+
+
+### Docs
+
+* new version ([7c96050](https://github.com/f-aguzzi/tesi/commit/7c96050e20382fdd2312584dd0cf8ee091329181))
+
+## [2.0.0-beta.1](https://github.com/f-aguzzi/tesi/compare/v1.2.0...v2.0.0-beta.1) (2024-06-04)
+
+
+### ⚠ BREAKING CHANGES
+
+* added base class for classifiers, models, settings
+
+### Features
+
+* added base class for classifiers, models, settings ([4af5d47](https://github.com/f-aguzzi/tesi/commit/4af5d4778d28021dcd2e23f00fc5810ae178769d))
+
+
+### Docs
+
+* fix broken github pages build ([862ddf6](https://github.com/f-aguzzi/tesi/commit/862ddf6557973229ec9b85830b677822db0f9da7))
+* new cookbook section ([fd9a243](https://github.com/f-aguzzi/tesi/commit/fd9a2435469bdcf0457909ec3424ce1af5b118a9))
+* updated docusaurus with versioning ([0b6d5c4](https://github.com/f-aguzzi/tesi/commit/0b6d5c4319f371a757ad0fc3a142e2eb1d959137)), closes [#33](https://github.com/f-aguzzi/tesi/issues/33)
+
 ## [1.2.0](https://github.com/f-aguzzi/tesi/compare/v1.1.3...v1.2.0) (2024-06-03)
 
 

diff --git a/chemfusekit/__base.py b/chemfusekit/__base.py
@@ -0,0 +1,52 @@
+'''Base classes for data models, settings and classifiers.'''
+
+import pandas as pd
+import numpy as np
+import joblib
+from .__utils import GraphMode
+
+
+class BaseDataModel:
+    '''Container for the data produced by a data-outputting operation.
+
+    The three constructor arguments are stored unchanged as the attributes
+    `x_data`, `x_train` and `y`.
+    '''
+    def __init__(self, x_data: pd.DataFrame, x_train: pd.DataFrame, y: np.ndarray):
+        self.x_data, self.x_train, self.y = x_data, x_train, y
+
+
+class BaseSettings:
+    '''Settings shared by the classifiers: graph output mode and split-test flag.
+
+    Raises a `Warning` when `test_split` is `True` while `output` is
+    `GraphMode.NONE`; otherwise both arguments are stored as attributes.
+    '''
+    def __init__(self, output: GraphMode = GraphMode.NONE, test_split: bool = False):
+        output_disabled = output is GraphMode.NONE
+        if output_disabled and test_split is True:
+            raise Warning(
+                "You selected test_split but it won't run because you disabled the output."
+            )
+        self.test_split = test_split
+        self.output = output
+
+
+class BaseClassifier:
+    '''Parent class for all classifiers, containing basic shared utilities.
+
+    Holds the settings, the input data and the model, which is `None` until
+    a model is stored by a subclass or loaded with `import_model`.
+    '''
+    def __init__(self, settings: BaseSettings, data: BaseDataModel):
+        self.settings = settings
+        self.data = data
+        self.model = None
+
+    def import_model(self, import_path: str):
+        '''Loads a previously exported model from `import_path` into `self.model`.'''
+        # joblib.load takes the file to read as its first argument and returns
+        # the deserialized object, so the result has to be stored explicitly.
+        # NOTE: joblib files are pickle-based; only load files from trusted sources.
+        self.model = joblib.load(import_path)
+
+    def export_model(self, export_path: str):
+        '''Dumps the current model to `export_path`.
+
+        Raises `RuntimeError` if `self.model` is still `None`.
+        '''
+        if self.model is None:
+            raise RuntimeError("You haven't trained the model yet! You cannot export it now.")
+        joblib.dump(self.model, export_path)
+
+    def predict(self, x_data: pd.DataFrame):
+        '''Performs prediction once the model is trained.
+
+        Returns whatever `self.model.predict(x_data)` returns.
+        Raises `TypeError` if `x_data` is `None` and `RuntimeError` if
+        `self.model` is `None`.
+        '''
+        if x_data is None:
+            raise TypeError(f"X data for {self.__class__.__name__} prediction must be non-empty.")
+        if self.model is None:
+            raise RuntimeError(f"The {self.__class__.__name__} model is not trained yet!")
+
+        return self.model.predict(x_data)
diff --git a/chemfusekit/knn.py b/chemfusekit/knn.py
@@ -6,20 +6,18 @@
 
 import pandas as pd
 
-from chemfusekit.lldf import LLDFModel
+from chemfusekit.lldf import LLDFDataModel
 from chemfusekit.__utils import run_split_test, print_confusion_matrix, print_table, GraphMode
+from .__base import BaseSettings, BaseClassifier
 
-class KNNSettings:
+
+class KNNSettings(BaseSettings):
     '''Holds the settings for the kNN object.'''
-    def __init__(
-            self,
-            n_neighbors: int = 15,
-            metric: str | Callable = 'euclidean',
-            weights: str | Callable = 'uniform',
-            algorithm: str = 'auto',
-            output: GraphMode = GraphMode.NONE,
-            test_split: bool = False
-        ):
+    def __init__(self, n_neighbors: int = 15, metric: str | Callable = 'euclidean', weights: str | Callable = 'uniform',
+                 algorithm: str = 'auto', output: GraphMode = GraphMode.NONE, test_split: bool = False):
+
+        super().__init__(output, test_split)
+
         if n_neighbors < 1:
             raise ValueError("Invalid n_neighbors number: should be a positive integer.")
         if metric not in ['minkwoski', 'precomputed', 'euclidean'] and not callable(metric):
@@ -32,23 +30,16 @@ def __init__(
             raise  ValueError(
                 "Invalid algorithm: should be 'auto', 'ball_tree', 'kd_tree' or 'brute'."
             )
-        if test_split is True and output is GraphMode.NONE:
-            raise Warning(
-                "You selected test_split but it won't run because you disabled the output."
-            )
         self.n_neighbors = n_neighbors
         self.metric = metric
         self.weights = weights
         self.algorithm = algorithm
-        self.output = output
-        self.test_split = test_split
 
-class KNN:
+
+class KNN(BaseClassifier):
     '''Class to store the data, methods and artifacts for k-Nearest Neighbors Analysis'''
-    def __init__(self, settings: KNNSettings, fused_data: LLDFModel):
-        self.settings = settings
-        self.fused_data = fused_data
-        self.model: Optional[KNeighborsClassifier] = None
+    def __init__(self, settings: KNNSettings, fused_data: LLDFDataModel):
+        super().__init__(settings, fused_data)
 
     def knn(self):
         '''Performs k-Nearest Neighbors Analysis'''
@@ -59,13 +50,13 @@ def knn(self):
             weights=self.settings.weights,
             algorithm=self.settings.algorithm
         )
-        knn.fit(self.fused_data.x_data, self.fused_data.y)
+        knn.fit(self.data.x_data, self.data.y)
 
         # Save the trained model
         self.model = knn
 
         # View the prediction on the test data
-        y_pred = knn.predict(self.fused_data.x_data)
+        y_pred = knn.predict(self.data.x_data)
         print_table(
             ["Predictions"],
             y_pred.reshape(1,len(y_pred)),
@@ -74,7 +65,7 @@ def knn(self):
         )
 
         print_confusion_matrix(
-            self.fused_data.y,
+            self.data.y,
             y_pred,
             "Confusion Matrix based on the whole data set",
             self.settings.output
@@ -87,15 +78,4 @@ def knn(self):
                 weights=self.settings.weights,
                 algorithm=self.settings.algorithm
             )
-            run_split_test(self.fused_data.x_data, self.fused_data.y, knn_split)
-
-
-    def predict(self, x_data: pd.DataFrame):
-        '''Performs kNN prediction once the model is trained.'''
-        if x_data is None:
-            raise TypeError("X data for kNN prediction must be non-empty.")
-        if self.model is None:
-            raise RuntimeError("The kNN model is not trained yet!")
-
-        y_pred = self.model.predict(x_data)
-        return y_pred
+            run_split_test(self.data.x_data, self.data.y, knn_split)
diff --git a/chemfusekit/lda.py b/chemfusekit/lda.py
@@ -6,39 +6,38 @@
 
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LD
 
-from chemfusekit.lldf import LLDFModel
+from chemfusekit.lldf import LLDFDataModel
 from chemfusekit.__utils import graph_output, run_split_test
 from chemfusekit.__utils import print_confusion_matrix, print_table, GraphMode
+from .__base import BaseDataModel, BaseClassifier, BaseSettings
+from .pca import PCADataModel
 
-class LDASettings:
+
+class LDASettings(BaseSettings):
     '''Holds the settings for the LDA object.'''
-    def __init__(self, components: int = 3, output: GraphMode = GraphMode.NONE,
-                 test_split: bool = False):
+    def __init__(self, components: int = 3, output: GraphMode = GraphMode.NONE, test_split: bool = False):
+        super().__init__(output, test_split)
         if components <= 2:
             raise ValueError("Invalid component number: must be a > 1 integer.")
-        if test_split is True and output is GraphMode.NONE:
-            raise Warning(
-                "You selected test_split but it won't run because you disabled the output."
-            )
         self.components = components
-        self.output = output
-        self.test_split = test_split
 
-class LDA:
+
+class LDA(BaseClassifier):
     '''Class to store the data, methods and artifacts for Linear Discriminant Analysis'''
-    def __init__(self, lldf_model: LLDFModel, settings: LDASettings):
+    def __init__(self, settings: LDASettings, data: BaseDataModel):
+        super().__init__(settings, data)
         self.settings = settings
-        self.x_data = lldf_model.x_data
-        self.x_train = lldf_model.x_train
-        self.y = lldf_model.y
-        self.model: Optional[LD] = None
+        self.data = data
+        # Self-detect components if the data is from PCA
+        if isinstance(data, PCADataModel):
+            self.settings.components = data.components - 1
 
     def lda(self):
         '''Performs Linear Discriminant Analysis'''
 
         lda = LD(n_components=self.settings.components) # N-1 where N are the classes
-        scores_lda = lda.fit(self.x_data, self.y).transform(self.x_data)
-        pred = lda.predict(self.x_data)
+        scores_lda = lda.fit(self.data.x_data, self.data.y).transform(self.data.x_data)
+        pred = lda.predict(self.data.x_data)
 
         print_table(
             [f"LV{i+1}" for i in range(scores_lda.shape[1])],
@@ -75,22 +74,22 @@ def lda(self):
             self.settings.output
         )
 
-        pred = lda.predict(self.x_data)
+        pred = lda.predict(self.data.x_data)
         print_confusion_matrix(
-            y1=self.y,
+            y1=self.data.y,
             y2=pred,
             title="LDA Training Confusion Matrix",
             mode=self.settings.output
         )
 
         lv_cols = [f'LV{i+1}' for i in range(self.settings.components)]
-        scores = pd.DataFrame(data = scores_lda, columns = lv_cols) # latent variables
-        scores.index = self.x_data.index
-        y_dataframe = pd.DataFrame(self.y, columns=['Substance'])
+        scores = pd.DataFrame(data=scores_lda, columns=lv_cols)     # latent variables
+        scores.index = self.data.x_data.index
+        y_dataframe = pd.DataFrame(self.data.y, columns=['Substance'])
 
         scores = pd.concat([scores, y_dataframe], axis = 1)
 
-        # Store the traiend model
+        # Store the trained model
         self.model = lda
 
         # Show graphs if required by the user
@@ -104,18 +103,8 @@ def lda(self):
         # Run split tests if required by the user
         if self.settings.test_split:
             run_split_test(
-                (scores.drop('Substance', axis=1).values),
-                self.y,
+                scores.drop('Substance', axis=1).values,
+                self.data.y,
                 LD(n_components=self.settings.components),
                 mode=self.settings.output
             )
-
-    def predict(self, x_data: pd.DataFrame):
-        '''Performs LDA prediction once the model is trained.'''
-        if x_data is None:
-            raise TypeError("X data for LDA prediction must be non-empty.")
-        if self.model is None:
-            raise RuntimeError("The LDA model is not trained yet!")
-
-        y_pred = self.model.predict(x_data)
-        return y_pred
diff --git a/chemfusekit/lldf.py b/chemfusekit/lldf.py
@@ -8,7 +8,7 @@
 import matplotlib
 import matplotlib.pyplot as plt
 
-from .__utils import GraphMode
+from .__base import GraphMode, BaseDataModel
 
 
 class Table:
@@ -19,12 +19,10 @@ def __init__(self, file_path: str, sheet_name: str, preprocessing: str):
         self.preprocessing = preprocessing
 
 
-class LLDFModel:
+class LLDFDataModel(BaseDataModel):
     '''Models the output data from the LLDF operation'''
     def __init__(self, x_data: pd.DataFrame, x_train: pd.DataFrame, y: np.ndarray):
-        self.x_data = x_data
-        self.x_train = x_train
-        self.y = y
+        super().__init__(x_data, x_train, y)
 
 
 class LLDFSettings:
@@ -40,19 +38,19 @@ def _snv(input_data: np.ndarray):
     for i in range(input_data.shape[0]):
 
         # Apply correction
-        output_data[i,:] = (
-            (input_data[i,:] - np.mean(input_data[i,:])) / np.std(input_data[i,:])
+        output_data[i, :] = (
+            (input_data[i, :] - np.mean(input_data[i,:])) / np.std(input_data[i, :])
         )
 
     return output_data
 
 
 class LLDF:
     '''Holds together all the data, methods and artifacts of the LLDF operation'''
-    def __init__(self, tables: List[Table], settings: LLDFSettings):
+    def __init__(self, settings: LLDFSettings, tables: List[Table]):
         self.settings = settings
         self.tables = tables
-        self.fused_data: Optional[LLDFModel] = None
+        self.fused_data: Optional[LLDFDataModel] = None
 
     def lldf(self):
         '''Performs low-level data fusion'''
@@ -139,7 +137,7 @@ def lldf(self):
             axis=1
         )
 
-        self.fused_data = LLDFModel(x_data, x_train, y)
+        self.fused_data = LLDFDataModel(x_data, x_train, y)
 
     def export_data(self, export_path: str):
         '''Exports the data fusion artifacts to a file'''