
Commit

refactor!: give train() function to BaseActionClass and children
f-aguzzi committed Jun 14, 2024
1 parent 77206b5 commit 2e836bc
Showing 32 changed files with 331 additions and 305 deletions.
5 changes: 5 additions & 0 deletions chemfusekit/__base.py
@@ -114,6 +114,11 @@ def __init__(self, settings: BaseSettings, data: BaseDataModel):
         self.data = data
         self.model: BaseEstimator | None = None
 
+    @abstractmethod
+    def train(self):
+        """Trains the estimator model."""
+        pass
+
     @classmethod
     def from_file(cls, settings, model_path):
         """Creates a classifier instance from file"""
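
With `train()` declared as an abstract method, every child of `BaseActionClass` must now implement training under that shared name. A minimal sketch of the contract, using a simplified stand-in for the base class and a hypothetical `DummyModel` child (neither is part of the diff):

```python
from abc import ABC, abstractmethod

from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression


class ActionClassSketch(ABC):
    """Simplified stand-in for chemfusekit's BaseActionClass."""

    def __init__(self, settings, data):
        self.settings = settings
        self.data = data
        self.model: BaseEstimator | None = None

    @abstractmethod
    def train(self):
        """Trains the estimator model."""


class DummyModel(ActionClassSketch):
    """Hypothetical child: subclasses are now required to define train()."""

    def train(self):
        # Fit any scikit-learn estimator and store it on self.model
        self.model = LogisticRegression()
        self.model.fit(self.data.x_data, self.data.y)
```
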
22 changes: 11 additions & 11 deletions chemfusekit/lldf.py → chemfusekit/df.py
@@ -21,14 +21,14 @@ def __init__(self, file_path: str, sheet_name: str, preprocessing: str, class_co
         self.index_column = index_column
 
 
-class LLDFDataModel(BaseDataModel):
-    """Models the output data from the LLDF operation"""
+class DFDataModel(BaseDataModel):
+    """Models the output data from the DF operation"""
     def __init__(self, x_data: pd.DataFrame, x_train: pd.DataFrame, y: np.ndarray):
         super().__init__(x_data, x_train, y)
 
 
-class LLDFSettings(BaseSettings):
-    """Holds the settings for the LLDF object."""
+class DFSettings(BaseSettings):
+    """Holds the settings for the DF object."""
     def __init__(self, output: GraphMode = GraphMode.NONE):
         super().__init__(output)
 
@@ -47,15 +47,15 @@ def _snv(input_data: np.ndarray):
     return output_data
 
 
-class LLDF:
+class DF:
     """Holds together all the data, methods and artifacts of the LLDF operation"""
-    def __init__(self, settings: LLDFSettings, tables: List[Table]):
+    def __init__(self, settings: DFSettings, tables: List[Table]):
         self.settings = settings
         self.tables = tables
-        self.fused_data: Optional[LLDFDataModel] = None
+        self.fused_data: Optional[DFDataModel] = None
 
-    def lldf(self):
-        """Performs low-level data fusion"""
+    def fuse(self):
+        """Performs data fusion"""
         x_vector = []
         for table in self.tables:
             try:
@@ -114,7 +114,7 @@ def lldf(self):
                 preprocessed_x = x
             else:
                 raise SyntaxError(
-                    f"LLDF: this type of preprocessing does not exist ({table.preprocessing=})"
+                    f"DF: this type of preprocessing does not exist ({table.preprocessing=})"
                 )
 
             if self.settings.output is GraphMode.GRAPHIC:
@@ -182,7 +182,7 @@ def lldf(self):
            axis=1
        )
 
-        self.fused_data = LLDFDataModel(x_data, x_train, y)
+        self.fused_data = DFDataModel(x_data, x_train, y)
 
    def export_data(self, export_path: str, sheet_name: str = 'Sheet1'):
        """Exports the data fusion artifacts to a file"""
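
In user code, the rename means importing from `chemfusekit.df` and calling `fuse()` where `lldf()` used to be called. A short sketch under the new names (the file path, sheet name and keyword values are illustrative, not taken from the diff):

```python
from chemfusekit.df import DFSettings, DF, Table, GraphMode

settings = DFSettings(output=GraphMode.NONE)
table = Table(
    file_path='dataset.xlsx',     # illustrative path
    sheet_name='Sheet1',          # illustrative sheet name
    preprocessing='snv',          # assumes 'snv' is an accepted preprocessing option
    class_column='Substance',     # illustrative class column
    index_column='Sample_id'      # illustrative index column
)

df = DF(settings, [table])
df.fuse()                         # previously lldf.lldf()
fused_data = df.fused_data        # now an Optional[DFDataModel]
```
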
2 changes: 1 addition & 1 deletion chemfusekit/knn.py
@@ -38,7 +38,7 @@ class KNN(BaseClassifier):
     def __init__(self, settings: KNNSettings, data: BaseDataModel):
         super().__init__(settings, data)
 
-    def knn(self):
+    def train(self):
         """Performs k-Nearest Neighbors Analysis"""
         # Prepare and train the kNN model
         knn = KNeighborsClassifier(
4 changes: 2 additions & 2 deletions chemfusekit/lda.py
@@ -42,7 +42,7 @@ def __init__(self, settings: LDASettings, data: BaseDataModel):
         if isinstance(data, PCADataModel):
             self.settings.components = data.components - 1
 
-    def lda(self):
+    def train(self):
         """Performs Linear Discriminant Analysis"""
 
         lda = LD(n_components=self.settings.components)  # N-1 where N are the classes
@@ -144,7 +144,7 @@ def rescaled_data(self) -> BaseDataModel:
         settings_backup = copy(self.settings)
         self.settings.output = GraphMode.NONE
         self.settings.test_split = False
-        self.lda()
+        self.train()
         self.settings = settings_backup
 
         x_data = pd.DataFrame(self.model.transform(self.data.x_data))
2 changes: 1 addition & 1 deletion chemfusekit/lr.py
@@ -35,7 +35,7 @@ def __init__(self, settings: LRSettings, data: BaseDataModel):
         else:
             self.array_scores = data.x_train.drop('Substance', axis=1).values
 
-    def lr(self):
+    def train(self):
         """Performs Logistic Regression"""
 
         # Let's build our model on the training set
4 changes: 2 additions & 2 deletions chemfusekit/pca.py
@@ -52,7 +52,7 @@ def __init__(self, settings: PCASettings, data: BaseDataModel):
         self.model: Optional[PC] = None
         self.array_scores: Optional[np.ndarray] = None
 
-    def pca(self):
+    def train(self):
         """Performs Principal Component Analysis."""
 
         # Read from the data fusion object
@@ -290,7 +290,7 @@ def rescaled_data(self) -> PCADataModel:
         if self.model is None:
             settings_backup = copy(self.settings)
             self.settings.output = GraphMode.NONE
-            self.pca()
+            self.train()
             self.pca_stats()
             self.settings = settings_backup
 
2 changes: 1 addition & 1 deletion chemfusekit/plsda.py
@@ -29,7 +29,7 @@ class PLSDA(BaseClassifier):
     def __init__(self, settings: PLSDASettings, data: BaseDataModel):
         super().__init__(settings, data)
 
-    def plsda(self):
+    def train(self):
         """Performs Partial Least Squares Discriminant Analysis"""
         x = self.data.x_data
         y = self.data.x_train.Substance.astype('category').cat.codes
2 changes: 1 addition & 1 deletion chemfusekit/svm.py
@@ -21,7 +21,7 @@ class SVM(BaseClassifier):
     def __init__(self, settings: SVMSettings, data: BaseDataModel):
         super().__init__(settings, data)
 
-    def svm(self):
+    def train(self):
         """Performs Support Vector Machine analysis"""
 
         # Linear kernel
16 changes: 8 additions & 8 deletions docs/cookbook/case-study-data-fusion.md
@@ -90,10 +90,10 @@ On the other hand, the GC data does not require any preprocessing. The data obta
 The most significant aspect of data preprocessing in this case study is data fusion. The three tables contained in the Excel datasheet are concatenated row-wise to form a single table that contains the data from the IMS and QEPAS spectrometers, as well as the GC retention times.
 
 ```python
-from chemfusekit.lldf import LLDFSettings, LLDF, GraphMode, Table
+from chemfusekit.df import DFSettings, DF, GraphMode, Table
 
 # Initialize the settings to produce graphical output for the operation
-settings = LLDFSettings(output=GraphMode.GRAPHIC)
+settings = DFSettings(output=GraphMode.GRAPHIC)
 
 # Set up the import settings for the first table (IMS spectral data)
 table1 = Table(
@@ -126,7 +126,7 @@ table3 = Table(
 tables = [table1, table2, table3]
 
 # Let's pass the settings and the tables to the LLDF constructor
-lldf = LLDF(settings, tables)
+lldf = DF(settings, tables)
 
 # Let's finally perform data fusion with the lldf() method!
 lldf.lldf()
@@ -156,7 +156,7 @@ pca_settings = PCASettings(
 
 # Initialize and run PCA on the fused dataset
 pca = PCA(pca_settings, fused_data)
-pca.pca()
+pca.train()
 
 # Run the tests and statistics
 pca.pca_stats()
@@ -182,7 +182,7 @@ lr_settings = LRSettings(output=GraphMode.GRAPHIC, test_split=True)
 
 # Initialize and train LR
 lr = LR(lr_settings, reduced_dataset)
-lr.lr()
+lr.train()
 ```
 
 ### Model evaluation
@@ -206,12 +206,12 @@ pca.export_model("DMMP_acetone_pca.sklearn")
 In the future, when we need to classify DMMP and acetone on a new dataset, we can simply import the new dataset, perform the necessary data fusion, reduce the dimensionality through PCA, import the pre-trained `LR` model, and use it to classify the data. This streamlined process allows for efficient and consistent classification of DMMP and acetone samples.
 
 ```python
-from chemfusekit.lldf import LLDFSettings, LLDF, GraphMode, Table
+from chemfusekit.df import DFSettings, DF, GraphMode, Table
 from chemfusekit.pca import PCASettings, PCA
 from chemfusekit.lr import LRSettings, LR
 
 # Data fusion
-lldf_settings = LLDFSettings(output=GraphMode.GRAPHIC)
+lldf_settings = DFSettings(output=GraphMode.GRAPHIC)
 table1 = Table(
     file_path='new_dataset.xlsx',
     sheet_name='IMS',
@@ -234,7 +234,7 @@ table3 = Table(
     index_column='Sample_id'
 )
 tables = [table1, table2, table3]
-lldf = LLDF(lldf_settings, tables)
+lldf = DF(lldf_settings, tables)
 lldf.lldf()
 fused_data = lldf.fused_data
 
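
The paragraph introducing the snippet above describes reusing a pre-trained `LR` model on newly fused, PCA-reduced data. A hedged sketch of that last step, assuming `import_model()` and `predict()` keep the signatures shown elsewhere in this commit (`reduced_dataset` stands for the PCA-reduced data model, and the file name is illustrative):

```python
from chemfusekit.lr import LRSettings, LR

lr = LR(LRSettings(), reduced_dataset)      # assumes LRSettings() has usable defaults
lr.import_model('DMMP_acetone_lr.sklearn')  # illustrative file name
predictions = lr.predict(reduced_dataset.x_data)
```
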
6 changes: 3 additions & 3 deletions docs/cookbook/data-operations.md
@@ -148,9 +148,9 @@ Even though the header names are slightly different, the content of the first tw
 The `LLDF` module allows us to join these two tables (the current and the one from the previous examples) to form a single dataset that contains both spectral data and retention times. Let's see how.
 
 ```python
-from chemfusekit.lldf import LLDFSettings, LLDF, GraphMode, Table
+from chemfusekit.df import DFSettings, DF, GraphMode, Table
 
-settings = LLDFSettings() # Initialize the default settings
+settings = DFSettings() # Initialize the default settings
 
 # Set up the import settings for the first table (spectral data)
 table1 = Table(
@@ -173,7 +173,7 @@ table2 = Table(
 tables = [Table1, Table2]
 
 # Let's pass the settings and the tables to the LLDF constructor
-lldf = LLDF(settings, tables)
+lldf = DF(settings, tables)
 
 # Let's finally perform data fusion with the lldf() method!
 lldf.lldf()
102 changes: 62 additions & 40 deletions docs/cookbook/structure.md
@@ -72,7 +72,7 @@ Let's assume your new data is called `new_data`. Knowing that the training data,
 
 ```python
 knn.data = new_data
-knn.knn()
+knn.train()
 ```
 
 The training method is always called like its container class, but in lower case. To train a `KNN` model, like in this case, you just have to call `.knn()` on it. Same goes for `.lda()` on `LDA`, `.lldf()` on `LLDF`, and so on.
@@ -124,65 +124,86 @@ The classifiers themselves all inherit from a base class called [`BaseClassifie

```mermaid
classDiagram
<<Abstract>> BaseActionClass
class BaseActionClass {
__init__(settings, data)
+settings: BaseSettings
+data: BaseDataModel
+model: sklearn model
import_model(import_path: str)
export_model(export_path: str)
}
BaseDataModel <|-- LLDFDataModel
BaseDataModel <|-- ComponentDataModel
ComponentDataModel <|.. PCADataModel
ComponentDataModel <|.. LDADataModel
<<Abstract>> BaseClassifier
class BaseClassifier {
+settings: BaseSettings
__init__(settings, data)
predict(x_data: pd.DataFrame)
class BaseDataModel {
x_data
x_train
y
load_from_file()
export_to_file()
}
<<Abstract>> BaseReducer
class BaseReducer {
+export_data()
+reduce()
<<abstract>> ComponentDataModel
class ComponentDataModel {
n_components
}
class KNN {
...
class PCADataModel {
array_scores
}
class LDA {
...
class LDADataModel {
}
class LR {
...
class LLDFDataModel {
tables
}
class PLSDA {
...
BaseActionClass <|.. BaseReducer
BaseActionClass <|.. BaseClassifier
BaseDataModel *-- BaseActionClass
<<abstract>> BaseActionClass
class BaseActionClass {
train()
settings
data ~BaseDataModel~
model
from_file()
import_model()
export_model()
}
class SVM {
...
BaseReducer <|.. PCA
BaseReducer <|.. LDA
BaseReducer <|.. PLSDA
<<abstract>> BaseReducer
class BaseReducer {
components
rescaled_data
export_data()
reduce()
}
<<abstract>> BaseClassifier
class BaseClassifier {
predict()
}
BaseClassifier <| .. LDA
BaseClassifier <| .. LR
BaseClassifier <| .. SVM
BaseClassifier <| .. KNN
BaseClassifier <| .. PLSDA
class PCA {
pca_stats()
}
BaseClassifier <|.. KNN
BaseClassifier <|.. LDA
BaseClassifier <|.. LR
BaseClassifier <|.. PLSDA
BaseClassifier <|.. SVM
BaseReducer <|.. PCA
BaseReducer <|.. LDA
class LLDF {
fuse_data()
}
BaseActionClass <|.. BaseReducer
BaseActionClass <|.. BaseClassifier
class LR {
array_scores
}
```


@@ -216,6 +216,7 @@ classDiagram
 
 This allows all the classifiers to use the `LLDF` data, dimension-reduced `PCA` data, or any other type of data as long as it follows the `BaseDataModel` template.
 
+
 ## File import and export
 
 All the data models (`BaseDataModel`, and its derived, `LLDFDataModel` and `PCADataModel`) can export their content to Excel tables.
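
After this commit, the per-class training methods mentioned above (`.knn()`, `.lda()`, and so on) are unified: any `BaseActionClass` child is trained by calling `train()`, and anything that follows the `BaseDataModel` template can be handed to it. A brief sketch, assuming the settings classes can be constructed with defaults and that `fused_data` was produced elsewhere (for example by `DF.fuse()`):

```python
from chemfusekit.pca import PCASettings, PCA
from chemfusekit.knn import KNNSettings, KNN

pca = PCA(PCASettings(), fused_data)   # fused_data: any BaseDataModel-compatible object
pca.train()                            # previously pca.pca()

reduced = pca.rescaled_data            # PCA-rescaled data, still a BaseDataModel
knn = KNN(KNNSettings(), reduced)      # classifiers accept any BaseDataModel
knn.train()                            # previously knn.knn()
```
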
File renamed without changes.
