feat: feature number autoselection in PLSDA and LDA
f-aguzzi committed Jun 18, 2024
1 parent f9e3431 commit a4f8983
Showing 5 changed files with 45 additions and 19 deletions.
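
Both classifiers now share the same selection strategy: fit a model for each candidate component count, score it with 5-fold cross-validation, and keep the count with the highest mean score. The sketch below illustrates that strategy in isolation, using scikit-learn's PLSRegression on synthetic data; it is an illustrative stand-in rather than chemfusekit's own API, and the dataset and the cap of 10 candidates are arbitrary assumptions.

import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

# Synthetic classification data; classes are encoded as integer codes,
# mirroring how PLSDA.train() prepares its targets.
X, y = make_classification(n_samples=150, n_features=20, n_informative=8,
                           n_classes=3, random_state=0)

# Candidate component counts, capped by the number of features
# (the cap of 10 is an arbitrary choice for this sketch).
candidates = np.arange(1, min(X.shape[1], 10) + 1)

# Mean 5-fold cross-validated score (R^2 for a regressor) per candidate.
cv_scores = [cross_val_score(PLSRegression(n_components=n), X, y, cv=5).mean()
             for n in candidates]

# Keep the component count that maximizes the cross-validated score.
best = int(candidates[np.argmax(cv_scores)])
print(f"Selected {best} components (mean CV score: {max(cv_scores):.3f})")
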
2 changes: 2 additions & 0 deletions chemfusekit/__base.py
@@ -1,5 +1,6 @@
"""A base class for all classifiers."""
from abc import ABC, abstractmethod
from typing import Optional

import pandas as pd
import numpy as np
@@ -182,6 +183,7 @@ class BaseReducer(BaseActionClass):

def __init__(self, settings: BaseSettings, data: BaseDataModel):
super().__init__(settings, data)
self.array_scores: Optional[np.ndarray] = None

@abstractmethod
def export_data(self) -> BaseDataModel:
31 changes: 25 additions & 6 deletions chemfusekit/lda.py
@@ -1,11 +1,13 @@
"""Linear Discriminant Analysis module"""
from copy import copy
from functools import cached_property
from typing import Optional

import numpy as np
import pandas as pd

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LD
from sklearn.model_selection import cross_val_score

from chemfusekit.__utils import graph_output, run_split_test
from chemfusekit.__utils import print_confusion_matrix, print_table, GraphMode
@@ -23,7 +25,7 @@ def __init__(self, x_data: pd.DataFrame, x_train: pd.DataFrame, y: np.ndarray, c
class LDASettings(BaseClassifierSettings):
"""Holds the settings for the LDA object."""

def __init__(self, components: int = 3, output: GraphMode = GraphMode.NONE, test_split: bool = False):
def __init__(self, components: int | None = None, output: GraphMode = GraphMode.NONE, test_split: bool = False):
super().__init__(output, test_split)
if components <= 2:
raise ValueError("Invalid component number: must be a > 1 integer.")
@@ -40,17 +42,22 @@ def __init__(self, settings: LDASettings, data: BaseDataModel):
# Self-detect components if the data is from PCA
if isinstance(data, ReducerDataModel):
self.settings.components = data.components - 1
self.array_scores: Optional[np.ndarray] = None

def train(self):
"""Performs Linear Discriminant Analysis"""

lda = LD(n_components=self.settings.components) # N-1 where N are the classes
scores_lda = lda.fit(self.data.x_data, self.data.y).transform(self.data.x_data)
# Auto-selection of the number of components if not specified
if self.components is None:
self._select_feature_number(self.data.x_data, self.data.y)

lda = LD(n_components=self.settings.components)
self.array_scores = lda.fit_transform(self.data.x_data, self.data.y)
pred = lda.predict(self.data.x_data)

print_table(
[f"LV{i + 1}" for i in range(scores_lda.shape[1])],
list(zip(*scores_lda)),
[f"LV{i + 1}" for i in range(self.array_scores.shape[1])],
list(zip(*self.array_scores)),
"LDA scores",
self.settings.output
)
@@ -94,7 +101,7 @@ def train(self):
)

lv_cols = [f'LV{i + 1}' for i in range(self.settings.components)]
scores = pd.DataFrame(data=scores_lda, columns=lv_cols) # latent variables
scores = pd.DataFrame(data=self.array_scores, columns=lv_cols) # latent variables
scores.index = self.data.x_data.index
y_dataframe = pd.DataFrame(self.data.y, columns=['Substance'])

@@ -158,3 +165,15 @@ def rescaled_data(self) -> BaseDataModel:
x_train,
self.data.y
)

def _select_feature_number(self, x, y):
# Auto-select the number of components
max_comps = min(self.data.x_data.shape[1], self.settings.components)
n_components = np.arange(1, max_comps + 1)
cv_scores = []
for n in n_components:
lda = LD(n_components=n)
scores = cross_val_score(lda, x, y, cv=5)
cv_scores.append(scores.mean())
# Select the number of components that maximizes the cross-validated score
self.components = n_components[np.argmax(cv_scores)]
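
One detail worth keeping in mind when reading the LDA changes: scikit-learn's LinearDiscriminantAnalysis can produce at most min(n_features, n_classes - 1) discriminant components, and fit_transform returns the projected scores that the new array_scores attribute stores. A minimal sketch of that behaviour, independent of chemfusekit and using made-up data:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LD

# Three classes -> LDA yields at most 2 discriminant components.
X, y = make_classification(n_samples=120, n_features=10, n_informative=6,
                           n_classes=3, random_state=0)

lda = LD(n_components=2)
array_scores = lda.fit_transform(X, y)   # shape: (n_samples, n_components)
print(array_scores.shape)                # (120, 2)

# Column labels of the same form as the "LDA scores" table printed in train().
lv_cols = [f"LV{i + 1}" for i in range(array_scores.shape[1])]
print(lv_cols)                           # ['LV1', 'LV2']
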
29 changes: 16 additions & 13 deletions chemfusekit/plsda.py
@@ -15,7 +15,7 @@
class PLSDASettings(BaseClassifierSettings):
"""Holds the settings for the PLSDA object."""

def __init__(self, components: int = 5, output: GraphMode = GraphMode.NONE, test_split: bool = False):
def __init__(self, components: int | None = None, output: GraphMode = GraphMode.NONE, test_split: bool = False):
super().__init__(output, test_split)
if components < 1:
raise ValueError("Invalid n_components number: should be a positive integer.")
@@ -52,18 +52,9 @@ def train(self):
x = self.data.x_data
y = self.data.x_train.Substance.astype('category').cat.codes

# Autoselect the number of components
max_comps = min(self.data.x_data.shape[1], self.settings.components)
n_components = np.arange(1, max_comps + 1)
cv_scores = []

for n in n_components:
plsda = PLSR(n_components=n)
scores = cross_val_score(plsda, x, y, cv=5)
cv_scores.append(scores.mean())

# Select the number of components that maximizes the cross-validated score
self.components = n_components[np.argmax(cv_scores)]
# Auto-select the number of components only if the current number is null
if self.components is None:
self._select_feature_number(x, y)

# Re-create the model
regr_pls = PLSR(n_components=self.components)
@@ -147,6 +138,18 @@ def train(self):
y = self.data.x_train.Substance.astype('category').cat.codes
run_split_test(x, y, PLSR(self.settings.components), mode=self.settings.output)

def _select_feature_number(self, x, y):
# Auto-select the number of components
max_comps = min(self.data.x_data.shape[1], self.settings.components)
n_components = np.arange(1, max_comps + 1)
cv_scores = []
for n in n_components:
plsda = PLSR(n_components=n)
scores = cross_val_score(plsda, x, y, cv=5)
cv_scores.append(scores.mean())
# Select the number of components that maximizes the cross-validated score
self.components = n_components[np.argmax(cv_scores)]

def import_model(self, import_path: str):
model_backup = copy(self.model)
super().import_model(import_path)
1 change: 1 addition & 0 deletions docs/docs/lda/lda.md
@@ -38,6 +38,7 @@ LDA(settings: LDASettings, data: BaseDataModel)
- `predict(self, x_data)`: performs LDA prediction once the model is trained.
- *raises*:
- `RuntimeError("The LDA model is not trained yet!")` if the LDA model hasn't been trained yet
- `_select_feature_number(x, y)`: auto-selects the number of features using 5-fold cross-validation

## Example

1 change: 1 addition & 0 deletions docs/docs/plsda/plsda.md
@@ -47,6 +47,7 @@ PLSDA(settings: PLSDASettings, data: BaseDataModel)
- `reduce(self) -> BaseDataModel`: reduces the dimensionality of the data.
+ *raises*:
- `RuntimeError("The model hasn't been trained yet! You cannot use it to reduce data dimensionality.")` when run with an untrained `model`.
- `_select_feature_number(x, y)`: auto-selects the number of features using 5-fold cross-validation

## Example

