feat: feature number autoselection in PLSDA and LDA
f-aguzzi committed Jun 18, 2024
1 parent f9e3431 commit a4f8983
Showing 5 changed files with 45 additions and 19 deletions.
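
Both classifiers now share the same selection strategy: fit a model for each candidate component count, score it with 5-fold cross-validation, and keep the count with the highest mean score. The sketch below illustrates that strategy in isolation, using scikit-learn's PLSRegression on synthetic data; it is an illustrative stand-in rather than chemfusekit's own API, and the dataset and the cap of 10 candidates are arbitrary assumptions.

import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

# Synthetic classification data; classes are encoded as integer codes,
# mirroring how PLSDA.train() prepares its targets.
X, y = make_classification(n_samples=150, n_features=20, n_informative=8,
                           n_classes=3, random_state=0)

# Candidate component counts, capped by the number of features
# (the cap of 10 is an arbitrary choice for this sketch).
candidates = np.arange(1, min(X.shape[1], 10) + 1)

# Mean 5-fold cross-validated score (R^2 for a regressor) per candidate.
cv_scores = [cross_val_score(PLSRegression(n_components=n), X, y, cv=5).mean()
             for n in candidates]

# Keep the component count that maximizes the cross-validated score.
best = int(candidates[np.argmax(cv_scores)])
print(f"Selected {best} components (mean CV score: {max(cv_scores):.3f})")
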
2 changes: 2 additions & 0 deletions chemfusekit/__base.py
@@ -1,5 +1,6 @@
"""A base class for all classifiers."""
from abc import ABC, abstractmethod
from typing import Optional

import pandas as pd
import numpy as np
@@ -182,6 +183,7 @@ class BaseReducer(BaseActionClass):

def __init__(self, settings: BaseSettings, data: BaseDataModel):
super().__init__(settings, data)
self.array_scores: Optional[np.ndarray] = None

@abstractmethod
def export_data(self) -> BaseDataModel:
31 changes: 25 additions & 6 deletions chemfusekit/lda.py
@@ -1,11 +1,13 @@
"""Linear Discriminant Analysis module"""
from copy import copy
from functools import cached_property
from typing import Optional

import numpy as np
import pandas as pd

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LD
from sklearn.model_selection import cross_val_score

from chemfusekit.__utils import graph_output, run_split_test
from chemfusekit.__utils import print_confusion_matrix, print_table, GraphMode
@@ -23,7 +25,7 @@ def __init__(self, x_data: pd.DataFrame, x_train: pd.DataFrame, y: np.ndarray, c
class LDASettings(BaseClassifierSettings):
"""Holds the settings for the LDA object."""

def __init__(self, components: int = 3, output: GraphMode = GraphMode.NONE, test_split: bool = False):
def __init__(self, components: int | None = None, output: GraphMode = GraphMode.NONE, test_split: bool = False):
super().__init__(output, test_split)
if components <= 2:
raise ValueError("Invalid component number: must be a > 1 integer.")
@@ -40,17 +42,22 @@ def __init__(self, settings: LDASettings, data: BaseDataModel):
# Self-detect components if the data is from PCA
if isinstance(data, ReducerDataModel):
self.settings.components = data.components - 1
self.array_scores: Optional[np.ndarray] = None

def train(self):
"""Performs Linear Discriminant Analysis"""

lda = LD(n_components=self.settings.components) # N-1 where N are the classes
scores_lda = lda.fit(self.data.x_data, self.data.y).transform(self.data.x_data)
# Auto-selection of the number of components if not specified
if self.components is None:
self._select_feature_number(self.data.x_data, self.data.y)

lda = LD(n_components=self.settings.components)
self.array_scores = lda.fit_transform(self.data.x_data, self.data.y)
pred = lda.predict(self.data.x_data)

print_table(
[f"LV{i + 1}" for i in range(scores_lda.shape[1])],
list(zip(*scores_lda)),
[f"LV{i + 1}" for i in range(self.array_scores.shape[1])],
list(zip(*self.array_scores)),
"LDA scores",
self.settings.output
)
@@ -94,7 +101,7 @@ def train(self):
)

lv_cols = [f'LV{i + 1}' for i in range(self.settings.components)]
scores = pd.DataFrame(data=scores_lda, columns=lv_cols) # latent variables
scores = pd.DataFrame(data=self.array_scores, columns=lv_cols) # latent variables
scores.index = self.data.x_data.index
y_dataframe = pd.DataFrame(self.data.y, columns=['Substance'])

@@ -158,3 +165,15 @@ def rescaled_data(self) -> BaseDataModel:
x_train,
self.data.y
)

def _select_feature_number(self, x, y):
# Auto-select the number of components
max_comps = min(self.data.x_data.shape[1], self.settings.components)
n_components = np.arange(1, max_comps + 1)
cv_scores = []
for n in n_components:
lda = LD(n_components=n)
scores = cross_val_score(lda, x, y, cv=5)
cv_scores.append(scores.mean())
# Select the number of components that maximizes the cross-validated score
self.components = n_components[np.argmax(cv_scores)]
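
One detail worth keeping in mind when reading the LDA changes: scikit-learn's LinearDiscriminantAnalysis can produce at most min(n_features, n_classes - 1) discriminant components, and fit_transform returns the projected scores that the new array_scores attribute stores. A minimal sketch of that behaviour, independent of chemfusekit and using made-up data:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LD

# Three classes -> LDA yields at most 2 discriminant components.
X, y = make_classification(n_samples=120, n_features=10, n_informative=6,
                           n_classes=3, random_state=0)

lda = LD(n_components=2)
array_scores = lda.fit_transform(X, y)   # shape: (n_samples, n_components)
print(array_scores.shape)                # (120, 2)

# Column labels of the same form as the "LDA scores" table printed in train().
lv_cols = [f"LV{i + 1}" for i in range(array_scores.shape[1])]
print(lv_cols)                           # ['LV1', 'LV2']
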
29 changes: 16 additions & 13 deletions chemfusekit/plsda.py
@@ -15,7 +15,7 @@
class PLSDASettings(BaseClassifierSettings):
"""Holds the settings for the PLSDA object."""

def __init__(self, components: int = 5, output: GraphMode = GraphMode.NONE, test_split: bool = False):
def __init__(self, components: int | None = None, output: GraphMode = GraphMode.NONE, test_split: bool = False):
super().__init__(output, test_split)
if components < 1:
raise ValueError("Invalid n_components number: should be a positive integer.")
@@ -52,18 +52,9 @@ def train(self):
x = self.data.x_data
y = self.data.x_train.Substance.astype('category').cat.codes

# Autoselect the number of components
max_comps = min(self.data.x_data.shape[1], self.settings.components)
n_components = np.arange(1, max_comps + 1)
cv_scores = []

for n in n_components:
plsda = PLSR(n_components=n)
scores = cross_val_score(plsda, x, y, cv=5)
cv_scores.append(scores.mean())

# Select the number of components that maximizes the cross-validated score
self.components = n_components[np.argmax(cv_scores)]
# Auto-select the number of components only if the current number is null
if self.components is None:
self._select_feature_number(x, y)

# Re-create the model
regr_pls = PLSR(n_components=self.components)
@@ -147,6 +138,18 @@ def train(self):
y = self.data.x_train.Substance.astype('category').cat.codes
run_split_test(x, y, PLSR(self.settings.components), mode=self.settings.output)

def _select_feature_number(self, x, y):
# Auto-select the number of components
max_comps = min(self.data.x_data.shape[1], self.settings.components)
n_components = np.arange(1, max_comps + 1)
cv_scores = []
for n in n_components:
plsda = PLSR(n_components=n)
scores = cross_val_score(plsda, x, y, cv=5)
cv_scores.append(scores.mean())
# Select the number of components that maximizes the cross-validated score
self.components = n_components[np.argmax(cv_scores)]

def import_model(self, import_path: str):
model_backup = copy(self.model)
super().import_model(import_path)
1 change: 1 addition & 0 deletions docs/docs/lda/lda.md
@@ -38,6 +38,7 @@ LDA(settings: LDASettings, data: BaseDataModel)
- `predict(self, x_data)`: performs LDA prediction once the model is trained.
- *raises*:
- `RuntimeError("The LDA model is not trained yet!")` if the LDA model hasn't been trained yet
- `_select_feature_number(x, y)`: auto-selects the number of features using 5-fold cross-validation

## Example

1 change: 1 addition & 0 deletions docs/docs/plsda/plsda.md
@@ -47,6 +47,7 @@ PLSDA(settings: PLSDASettings, data: BaseDataModel)
- `reduce(self) -> BaseDataModel`: reduces the dimensionality of the data.
+ *raises*:
- `RuntimeError("The model hasn't been trained yet! You cannot use it to reduce data dimensionality.")` when run with an untrained `model`.
- `_select_feature_number(x, y)`: auto-selects the number of features using 5-fold cross-validation

## Example

