fix(pca): field and property issues
f-aguzzi committed Jun 12, 2024
1 parent 43ee55a commit 91cefd3
Showing 7 changed files with 53 additions and 28 deletions.
7 changes: 5 additions & 2 deletions chemfusekit/__base.py
@@ -170,7 +170,6 @@ class BaseReducer(BaseActionClass):

def __init__(self, settings: BaseSettings, data: BaseDataModel):
super().__init__(settings, data)
self.rescaled_data = None

@abstractmethod
def export_data(self) -> BaseDataModel:
@@ -193,4 +192,8 @@ def reduce(self, data: BaseDataModel) -> BaseDataModel:
x_train=x_train,
y=data.y
)


@property
@abstractmethod
def rescaled_data(self) -> BaseDataModel:
pass
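
The deleted `self.rescaled_data = None` field is replaced by an abstract property, so every reducer subclass must now supply its own `rescaled_data` implementation. A minimal sketch of that contract, with made-up `Toy*` classes standing in for the real chemfusekit types:

```python
from abc import ABC, abstractmethod


class ToyReducer(ABC):
    """Mirrors BaseReducer's new contract: no eager None field, an abstract property instead."""

    @property
    @abstractmethod
    def rescaled_data(self):
        """Dimensionality-reduced view of the training data."""


class ToyConcreteReducer(ToyReducer):
    def __init__(self, data):
        self.data = data

    @property
    def rescaled_data(self):
        # Each concrete reducer decides how (and when) to compute this.
        return [x * 0.5 for x in self.data]


print(ToyConcreteReducer([1.0, 2.0, 3.0]).rescaled_data)
# Instantiating ToyReducer directly would raise TypeError, since it is abstract.
```
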
24 changes: 23 additions & 1 deletion chemfusekit/lda.py
@@ -1,5 +1,6 @@
'''Linear Discriminant Analysis module'''
from copy import copy
from functools import cached_property
from typing import Optional

import joblib
@@ -8,7 +9,6 @@

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LD

from chemfusekit.lldf import LLDFDataModel
from chemfusekit.__utils import graph_output, run_split_test
from chemfusekit.__utils import print_confusion_matrix, print_table, GraphMode
from .__base import BaseDataModel, BaseClassifier, BaseClassifierSettings, BaseReducer
@@ -139,3 +139,25 @@ def export_data(self) -> LDADataModel:
y=self.data.y,
components=self.settings.components
)

@cached_property
def rescaled_data(self) -> BaseDataModel:
if self.model is None:
settings_backup = copy(self.settings)
self.settings.output = GraphMode.NONE
self.settings.test_split = False
self.lda()
self.settings = settings_backup

x_data = pd.DataFrame(self.model.transform(self.data.x_data))
y_dataframe = pd.DataFrame(self.data.y, columns=['Substance'])
x_train = pd.concat(
[y_dataframe, x_data],
axis=1
)

return BaseDataModel(
x_data,
x_train,
self.data.y
)
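
The new cached property trains the LDA model on first access, temporarily silencing output and disabling the test split so the lazy fit stays quiet, then restores the caller's settings. Here is the same pattern in isolation as a runnable sketch; `ToySettings` and `ToyClassifier` are illustrative stand-ins, not chemfusekit classes:

```python
from copy import copy
from dataclasses import dataclass
from functools import cached_property


@dataclass
class ToySettings:
    output: str = "graphic"
    test_split: bool = True


class ToyClassifier:
    def __init__(self, settings, data):
        self.settings = settings
        self.data = data
        self.model = None

    def train(self):
        # Stand-in for LDA.lda(): a real run would honour self.settings.output.
        self.model = sum(self.data) / len(self.data)

    @cached_property
    def rescaled_data(self):
        if self.model is None:
            settings_backup = copy(self.settings)  # keep the caller's settings intact
            self.settings.output = "none"          # silence plots and tables
            self.settings.test_split = False
            self.train()
            self.settings = settings_backup        # restore afterwards
        return [x - self.model for x in self.data]


clf = ToyClassifier(ToySettings(), [1.0, 2.0, 3.0])
print(clf.rescaled_data)    # first access trains the model silently
print(clf.settings.output)  # original settings restored: "graphic"
```

Because the property is a `functools.cached_property`, repeated accesses reuse the first transform instead of refitting the model.
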
44 changes: 22 additions & 22 deletions chemfusekit/pca.py
@@ -1,4 +1,5 @@
'''Principal Component Analysis Module'''
import functools
from copy import copy
from functools import cached_property
from typing import Optional
@@ -48,7 +49,7 @@ class PCA(BaseReducer):
def __init__(self, settings: PCASettings, data: BaseDataModel):
super().__init__(settings, data)
self.components = 0
self.pca_model: Optional[PC] = None
self.model: Optional[PC] = None
self.array_scores: Optional[np.ndarray] = None

def pca(self):
@@ -108,10 +109,10 @@ def pca(self):
plt.ylabel('Cumulative Proportional Variance Explained')
plt.show()

# Run PCA producing the pca_model with a proper number of components
# Run PCA producing the model with a proper number of components
pca = PC(n_components=self.components)
self.pca_model = pca
self.pca_model.fit(x_data)
self.model = pca
self.model.fit(x_data)

def pca_stats(self):
'''Produces PCA-related statistics.'''
@@ -120,28 +121,28 @@ def pca_stats(self):

# Prepare the Scores dataframe (and concatenate the original 'Region' variable)
pc_cols = [f"PC{i+1}" for i in range(self.components)]
scores = pd.DataFrame(data=self.pca_model.fit_transform(x_data), columns=pc_cols)
scores = pd.DataFrame(data=self.model.fit_transform(x_data), columns=pc_cols)
scores.index = x_data.index
scores = pd.concat([scores, x_train.Substance], axis = 1)
scores = pd.concat([scores, x_train.Substance], axis=1)

print_table(
pc_cols + ['Substance'],
[scores.iloc[:,i] for i in range(scores.shape[1])],
[scores.iloc[:, i] for i in range(scores.shape[1])],
"PCA scores for each component",
self.settings.output
)

# Prepare the loadings dataframe
loadings = pd.DataFrame(
self.pca_model.components_.T,
self.model.components_.T,
columns=pc_cols,
index=x_data.columns
)
loadings["Attributes"] = loadings.index

print_table(
pc_cols + ['Retention Time'],
[loadings.iloc[:,i] for i in range(loadings.shape[1])],
[loadings.iloc[:, i] for i in range(loadings.shape[1])],
"PCA Loadings",
self.settings.output
)
@@ -177,9 +178,9 @@ def pca_stats(self):
fig.show()

# Get PCA scores
t = scores.iloc[:,0:self.components]
t = scores.iloc[:, 0:self.components]
# Get PCA loadings
p = loadings.iloc[:,0:self.components]
p = loadings.iloc[:, 0:self.components]
# Calculate error array
err = x_data - np.dot(t,p.T)
# Calculate Q-residuals (sum over the rows of the error array)
@@ -206,7 +207,7 @@ def mean_confidence_interval(data, confidence=self.settings.confidence_level):
# Create a dataframe using only T2 and Q-residuals
hot_q_data = pd.DataFrame(
{'T2': tsq, 'Qres': q, 'Substance': x_train.Substance},
index = x_data.index
index=x_data.index
)

if self.settings.output is GraphMode.GRAPHIC:
@@ -216,7 +217,7 @@ def mean_confidence_interval(data, confidence=self.settings.confidence_level):
x="T2",
y="Qres",
hover_data={'Sample': (hot_q_data.index)},
color = "Substance"
color="Substance"
)
fig.add_hline(y=abs(q_conf),line_dash="dot", line_color='Red')
fig.add_vline(x=tsq_conf,line_dash="dot", line_color='Red')
@@ -264,7 +265,7 @@ def mean_confidence_interval(data, confidence=self.settings.confidence_level):

print_table(
pc_cols,
[array_scores[:,i] for i in range(array_scores.shape[1])],
[array_scores[:, i] for i in range(array_scores.shape[1])],
"Array without 'Substance' column",
self.settings.output
)
@@ -273,7 +274,7 @@ def mean_confidence_interval(data, confidence=self.settings.confidence_level):

def export_data(self) -> PCADataModel:
'''Export data artifacts.'''
if self.pca_model is None or self.array_scores is None:
if self.model is None or self.array_scores is None:
raise RuntimeError("Run both pca() and pca_stats() methods before exporting data!")

return PCADataModel(
@@ -286,15 +287,14 @@ def export_data(self) -> PCADataModel:

@cached_property
def rescaled_data(self) -> PCADataModel:
if self.array_scores is None:
if self.model is None:
settings_backup = copy(self.settings)
self.settings.output = GraphMode.NONE
if self.pca_model is None:
self.pca()
self.pca()
self.pca_stats()
self.settings = settings_backup

x_data = pd.DataFrame(self.pca_model.transform(self.data.x_data))
x_data = pd.DataFrame(self.model.transform(self.data.x_data))
y_dataframe = pd.DataFrame(self.data.y, columns=['Substance'])
x_train = pd.concat(
[y_dataframe, x_data],
@@ -324,7 +324,7 @@ def from_file(cls, settings: PCASettings, model_path: str):
np.asarray(pd.DataFrame)
)
class_instance = PCA(settings, data)
class_instance.pca_model = model
class_instance.model = model
return class_instance

def import_model(self, import_path: str):
@@ -336,7 +336,7 @@ def import_model(self, import_path: str):

def export_model(self, export_path: str):
'''Exports the underlying sklearn PCA model to a file.'''
if self.pca_model is not None:
joblib.dump(self.pca_model, export_path)
if self.model is not None:
joblib.dump(self.model, export_path)
else:
raise RuntimeError("You haven't trained the model yet! You cannot export it now.")
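
With every `pca_model` reference renamed to `model`, persistence code keyed to the old attribute has to follow suit, as the `from_file`, `import_model` and `export_model` hunks above show. A hedged sketch of the export/reload round trip using plain scikit-learn and joblib; `MiniPCA` and the file name are illustrative stand-ins, not the library's actual class or artifact:

```python
import joblib
import numpy as np
from sklearn.decomposition import PCA as PC


class MiniPCA:
    """Illustrative stand-in: the fitted estimator lives in `model`, not `pca_model`."""

    def __init__(self, components):
        self.components = components
        self.model = None

    def fit(self, x_data):
        self.model = PC(n_components=self.components)
        self.model.fit(x_data)

    def export_model(self, export_path):
        # Same guard as PCA.export_model in the diff, applied to the renamed field.
        if self.model is None:
            raise RuntimeError("You haven't trained the model yet! You cannot export it now.")
        joblib.dump(self.model, export_path)


rng = np.random.default_rng(0)
x = rng.normal(size=(20, 5))

reducer = MiniPCA(components=2)
reducer.fit(x)
reducer.export_model("mini_pca_model.sklearn")

restored = joblib.load("mini_pca_model.sklearn")
assert np.allclose(reducer.model.transform(x), restored.transform(x))
```
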
2 changes: 1 addition & 1 deletion docs/docs/pca/pca.md
@@ -23,7 +23,7 @@ PCA(settings: PCASettings, data: BaseDataModel)

- `fused_data`: object of type [`LLDF`](../lldf/lldf-class.md). Contains the data to be analyzed.
- `components`: Number of components for the PCA analysis. Defaults to 0.
- `pca_model`: A `PCA` model from `scikit-learn`. Defaults to `None`.
- `model`: A `PCA` model from `scikit-learn`. Defaults to `None`.
- `settings`: object of type [`PCASettings`](./pcasettings.md). Contains the settings for
the `PCA` object.

2 changes: 1 addition & 1 deletion docs/versioned_docs/version-2.5.0/pca/pca.md
@@ -23,7 +23,7 @@ PCA(settings: PCASettings, data: BaseDataModel)

- `fused_data`: object of type [`LLDF`](../lldf/lldf-class.md). Contains the data to be analyzed.
- `components`: Number of components for the PCA analysis. Defaults to 0.
- `pca_model`: A `PCA` model from `scikit-learn`. Defaults to `None`.
- `model`: A `PCA` model from `scikit-learn`. Defaults to `None`.
- `settings`: object of type [`PCASettings`](./pcasettings.md). Contains the settings for
the `PCA` object.

Binary file modified pca_model.sklearn
Binary file not shown.
2 changes: 1 addition & 1 deletion tests/test_pca.py
@@ -192,7 +192,7 @@ def test_pca_import_export(self):
pca2 = PCA.from_file(pca_settings, 'pca_model.sklearn')

# Assert the equality of the two models
self.assertEqual(pca.pca_model.get_params(), pca2.pca_model.get_params())
self.assertEqual(pca.model.get_params(), pca2.model.get_params())

def test_pca_reduce(self):
'''Test case for data dimensionality reduction.'''
