
Commit

refactor!: give train() function to BaseActionClass and children
f-aguzzi committed Jun 14, 2024
1 parent 77206b5 commit 2e836bc
Showing 32 changed files with 331 additions and 305 deletions.
5 changes: 5 additions & 0 deletions chemfusekit/__base.py
@@ -114,6 +114,11 @@ def __init__(self, settings: BaseSettings, data: BaseDataModel):
         self.data = data
         self.model: BaseEstimator | None = None
 
+    @abstractmethod
+    def train(self):
+        """Trains the estimator model."""
+        pass
+
     @classmethod
     def from_file(cls, settings, model_path):
         """Creates a classifier instance from file"""
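
With `train()` declared as an abstract method, every child of `BaseActionClass` must now implement training under that shared name. A minimal sketch of the contract, using a simplified stand-in for the base class and a hypothetical `DummyModel` child (neither is part of the diff):

```python
from abc import ABC, abstractmethod

from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression


class ActionClassSketch(ABC):
    """Simplified stand-in for chemfusekit's BaseActionClass."""

    def __init__(self, settings, data):
        self.settings = settings
        self.data = data
        self.model: BaseEstimator | None = None

    @abstractmethod
    def train(self):
        """Trains the estimator model."""


class DummyModel(ActionClassSketch):
    """Hypothetical child: subclasses are now required to define train()."""

    def train(self):
        # Fit any scikit-learn estimator and store it on self.model
        self.model = LogisticRegression()
        self.model.fit(self.data.x_data, self.data.y)
```
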
22 changes: 11 additions & 11 deletions chemfusekit/lldf.py → chemfusekit/df.py
@@ -21,14 +21,14 @@ def __init__(self, file_path: str, sheet_name: str, preprocessing: str, class_co
         self.index_column = index_column
 
 
-class LLDFDataModel(BaseDataModel):
-    """Models the output data from the LLDF operation"""
+class DFDataModel(BaseDataModel):
+    """Models the output data from the DF operation"""
     def __init__(self, x_data: pd.DataFrame, x_train: pd.DataFrame, y: np.ndarray):
         super().__init__(x_data, x_train, y)
 
 
-class LLDFSettings(BaseSettings):
-    """Holds the settings for the LLDF object."""
+class DFSettings(BaseSettings):
+    """Holds the settings for the DF object."""
     def __init__(self, output: GraphMode = GraphMode.NONE):
         super().__init__(output)
 
@@ -47,15 +47,15 @@ def _snv(input_data: np.ndarray):
     return output_data
 
 
-class LLDF:
+class DF:
     """Holds together all the data, methods and artifacts of the LLDF operation"""
-    def __init__(self, settings: LLDFSettings, tables: List[Table]):
+    def __init__(self, settings: DFSettings, tables: List[Table]):
         self.settings = settings
         self.tables = tables
-        self.fused_data: Optional[LLDFDataModel] = None
+        self.fused_data: Optional[DFDataModel] = None
 
-    def lldf(self):
-        """Performs low-level data fusion"""
+    def fuse(self):
+        """Performs data fusion"""
         x_vector = []
         for table in self.tables:
             try:
@@ -114,7 +114,7 @@ def lldf(self):
                 preprocessed_x = x
             else:
                 raise SyntaxError(
-                    f"LLDF: this type of preprocessing does not exist ({table.preprocessing=})"
+                    f"DF: this type of preprocessing does not exist ({table.preprocessing=})"
                 )
 
             if self.settings.output is GraphMode.GRAPHIC:
@@ -182,7 +182,7 @@ def lldf(self):
            axis=1
        )
 
-        self.fused_data = LLDFDataModel(x_data, x_train, y)
+        self.fused_data = DFDataModel(x_data, x_train, y)
 
    def export_data(self, export_path: str, sheet_name: str = 'Sheet1'):
        """Exports the data fusion artifacts to a file"""
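
In user code, the rename means importing from `chemfusekit.df` and calling `fuse()` where `lldf()` used to be called. A short sketch under the new names (the file path, sheet name and keyword values are illustrative, not taken from the diff):

```python
from chemfusekit.df import DFSettings, DF, Table, GraphMode

settings = DFSettings(output=GraphMode.NONE)
table = Table(
    file_path='dataset.xlsx',     # illustrative path
    sheet_name='Sheet1',          # illustrative sheet name
    preprocessing='snv',          # assumes 'snv' is an accepted preprocessing option
    class_column='Substance',     # illustrative class column
    index_column='Sample_id'      # illustrative index column
)

df = DF(settings, [table])
df.fuse()                         # previously lldf.lldf()
fused_data = df.fused_data        # now an Optional[DFDataModel]
```
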
2 changes: 1 addition & 1 deletion chemfusekit/knn.py
@@ -38,7 +38,7 @@ class KNN(BaseClassifier):
     def __init__(self, settings: KNNSettings, data: BaseDataModel):
         super().__init__(settings, data)
 
-    def knn(self):
+    def train(self):
         """Performs k-Nearest Neighbors Analysis"""
         # Prepare and train the kNN model
         knn = KNeighborsClassifier(
4 changes: 2 additions & 2 deletions chemfusekit/lda.py
@@ -42,7 +42,7 @@ def __init__(self, settings: LDASettings, data: BaseDataModel):
         if isinstance(data, PCADataModel):
             self.settings.components = data.components - 1
 
-    def lda(self):
+    def train(self):
         """Performs Linear Discriminant Analysis"""
 
         lda = LD(n_components=self.settings.components)  # N-1 where N are the classes
@@ -144,7 +144,7 @@ def rescaled_data(self) -> BaseDataModel:
         settings_backup = copy(self.settings)
         self.settings.output = GraphMode.NONE
         self.settings.test_split = False
-        self.lda()
+        self.train()
         self.settings = settings_backup
 
         x_data = pd.DataFrame(self.model.transform(self.data.x_data))
2 changes: 1 addition & 1 deletion chemfusekit/lr.py
@@ -35,7 +35,7 @@ def __init__(self, settings: LRSettings, data: BaseDataModel):
         else:
             self.array_scores = data.x_train.drop('Substance', axis=1).values
 
-    def lr(self):
+    def train(self):
         """Performs Logistic Regression"""
 
         # Let's build our model on the training set
4 changes: 2 additions & 2 deletions chemfusekit/pca.py
@@ -52,7 +52,7 @@ def __init__(self, settings: PCASettings, data: BaseDataModel):
         self.model: Optional[PC] = None
         self.array_scores: Optional[np.ndarray] = None
 
-    def pca(self):
+    def train(self):
         """Performs Principal Component Analysis."""
 
         # Read from the data fusion object
@@ -290,7 +290,7 @@ def rescaled_data(self) -> PCADataModel:
         if self.model is None:
             settings_backup = copy(self.settings)
             self.settings.output = GraphMode.NONE
-            self.pca()
+            self.train()
             self.pca_stats()
             self.settings = settings_backup
 
2 changes: 1 addition & 1 deletion chemfusekit/plsda.py
@@ -29,7 +29,7 @@ class PLSDA(BaseClassifier):
     def __init__(self, settings: PLSDASettings, data: BaseDataModel):
         super().__init__(settings, data)
 
-    def plsda(self):
+    def train(self):
         """Performs Partial Least Squares Discriminant Analysis"""
         x = self.data.x_data
         y = self.data.x_train.Substance.astype('category').cat.codes
2 changes: 1 addition & 1 deletion chemfusekit/svm.py
@@ -21,7 +21,7 @@ class SVM(BaseClassifier):
     def __init__(self, settings: SVMSettings, data: BaseDataModel):
         super().__init__(settings, data)
 
-    def svm(self):
+    def train(self):
         """Performs Support Vector Machine analysis"""
 
         # Linear kernel
16 changes: 8 additions & 8 deletions docs/cookbook/case-study-data-fusion.md
@@ -90,10 +90,10 @@ On the other hand, the GC data does not require any preprocessing. The data obta
 The most significant aspect of data preprocessing in this case study is data fusion. The three tables contained in the Excel datasheet are concatenated row-wise to form a single table that contains the data from the IMS and QEPAS spectrometers, as well as the GC retention times.
 
 ```python
-from chemfusekit.lldf import LLDFSettings, LLDF, GraphMode, Table
+from chemfusekit.df import DFSettings, DF, GraphMode, Table
 
 # Initialize the settings to produce graphical output for the operation
-settings = LLDFSettings(output=GraphMode.GRAPHIC)
+settings = DFSettings(output=GraphMode.GRAPHIC)
 
 # Set up the import settings for the first table (IMS spectral data)
 table1 = Table(
@@ -126,7 +126,7 @@ table3 = Table(
 tables = [table1, table2, table3]
 
 # Let's pass the settings and the tables to the LLDF constructor
-lldf = LLDF(settings, tables)
+lldf = DF(settings, tables)
 
 # Let's finally perform data fusion with the lldf() method!
 lldf.lldf()
@@ -156,7 +156,7 @@ pca_settings = PCASettings(
 
 # Initialize and run PCA on the fused dataset
 pca = PCA(pca_settings, fused_data)
-pca.pca()
+pca.train()
 
 # Run the tests and statistics
 pca.pca_stats()
@@ -182,7 +182,7 @@ lr_settings = LRSettings(output=GraphMode.GRAPHIC, test_split=True)
 
 # Initialize and train LR
 lr = LR(lr_settings, reduced_dataset)
-lr.lr()
+lr.train()
 ```
 
 ### Model evaluation
@@ -206,12 +206,12 @@ pca.export_model("DMMP_acetone_pca.sklearn")
 In the future, when we need to classify DMMP and acetone on a new dataset, we can simply import the new dataset, perform the necessary data fusion, reduce the dimensionality through PCA, import the pre-trained `LR` model, and use it to classify the data. This streamlined process allows for efficient and consistent classification of DMMP and acetone samples.
 
 ```python
-from chemfusekit.lldf import LLDFSettings, LLDF, GraphMode, Table
+from chemfusekit.df import DFSettings, DF, GraphMode, Table
 from chemfusekit.pca import PCASettings, PCA
 from chemfusekit.lr import LRSettings, LR
 
 # Data fusion
-lldf_settings = LLDFSettings(output=GraphMode.GRAPHIC)
+lldf_settings = DFSettings(output=GraphMode.GRAPHIC)
 table1 = Table(
     file_path='new_dataset.xlsx',
     sheet_name='IMS',
@@ -234,7 +234,7 @@ table3 = Table(
     index_column='Sample_id'
 )
 tables = [table1, table2, table3]
-lldf = LLDF(lldf_settings, tables)
+lldf = DF(lldf_settings, tables)
 lldf.lldf()
 fused_data = lldf.fused_data
 
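
The paragraph introducing the snippet above describes reusing a pre-trained `LR` model on newly fused, PCA-reduced data. A hedged sketch of that last step, assuming `import_model()` and `predict()` keep the signatures shown elsewhere in this commit (`reduced_dataset` stands for the PCA-reduced data model, and the file name is illustrative):

```python
from chemfusekit.lr import LRSettings, LR

lr = LR(LRSettings(), reduced_dataset)      # assumes LRSettings() has usable defaults
lr.import_model('DMMP_acetone_lr.sklearn')  # illustrative file name
predictions = lr.predict(reduced_dataset.x_data)
```
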
6 changes: 3 additions & 3 deletions docs/cookbook/data-operations.md
@@ -148,9 +148,9 @@ Even though the header names are slightly different, the content of the first tw
 The `LLDF` module allows us to join these two tables (the current and the one from the previous examples) to form a single dataset that contains both spectral data and retention times. Let's see how.
 
 ```python
-from chemfusekit.lldf import LLDFSettings, LLDF, GraphMode, Table
+from chemfusekit.df import DFSettings, DF, GraphMode, Table
 
-settings = LLDFSettings() # Initialize the default settings
+settings = DFSettings() # Initialize the default settings
 
 # Set up the import settings for the first table (spectral data)
 table1 = Table(
@@ -173,7 +173,7 @@ table2 = Table(
 tables = [Table1, Table2]
 
 # Let's pass the settings and the tables to the LLDF constructor
-lldf = LLDF(settings, tables)
+lldf = DF(settings, tables)
 
 # Let's finally perform data fusion with the lldf() method!
 lldf.lldf()
102 changes: 62 additions & 40 deletions docs/cookbook/structure.md
@@ -72,7 +72,7 @@ Let's assume your new data is called `new_data`. Knowing that the training data,
 
 ```python
 knn.data = new_data
-knn.knn()
+knn.train()
 ```
 
 The training method is always called like its container class, but in lower case. To train a `KNN` model, like in this case, you just have to call `.knn()` on it. Same goes for `.lda()` on `LDA`, `.lldf()` on `LLDF`, and so on.
@@ -124,65 +124,86 @@ The classifiers themselves all inherit from a base class called [`BaseClassifie

```mermaid
classDiagram
<<Abstract>> BaseActionClass
class BaseActionClass {
__init__(settings, data)
+settings: BaseSettings
+data: BaseDataModel
+model: sklearn model
import_model(import_path: str)
export_model(export_path: str)
}
BaseDataModel <|-- LLDFDataModel
BaseDataModel <|-- ComponentDataModel
ComponentDataModel <|.. PCADataModel
ComponentDataModel <|.. LDADataModel
<<Abstract>> BaseClassifier
class BaseClassifier {
+settings: BaseSettings
__init__(settings, data)
predict(x_data: pd.DataFrame)
class BaseDataModel {
x_data
x_train
y
load_from_file()
export_to_file()
}
<<Abstract>> BaseReducer
class BaseReducer {
+export_data()
+reduce()
<<abstract>> ComponentDataModel
class ComponentDataModel {
n_components
}
class KNN {
...
class PCADataModel {
array_scores
}
class LDA {
...
class LDADataModel {
}
class LR {
...
class LLDFDataModel {
tables
}
class PLSDA {
...
BaseActionClass <|.. BaseReducer
BaseActionClass <|.. BaseClassifier
BaseDataModel *-- BaseActionClass
<<abstract>> BaseActionClass
class BaseActionClass {
train()
settings
data ~BaseDataModel~
model
from_file()
import_model()
export_model()
}
class SVM {
...
BaseReducer <|.. PCA
BaseReducer <|.. LDA
BaseReducer <|.. PLSDA
<<abstract>> BaseReducer
class BaseReducer {
components
rescaled_data
export_data()
reduce()
}
<<abstract>> BaseClassifier
class BaseClassifier {
predict()
}
BaseClassifier <| .. LDA
BaseClassifier <| .. LR
BaseClassifier <| .. SVM
BaseClassifier <| .. KNN
BaseClassifier <| .. PLSDA
class PCA {
pca_stats()
}
BaseClassifier <|.. KNN
BaseClassifier <|.. LDA
BaseClassifier <|.. LR
BaseClassifier <|.. PLSDA
BaseClassifier <|.. SVM
BaseReducer <|.. PCA
BaseReducer <|.. LDA
class LLDF {
fuse_data()
}
BaseActionClass <|.. BaseReducer
BaseActionClass <|.. BaseClassifier
class LR {
array_scores
}
```


@@ -216,6 +216,7 @@ classDiagram
 
 This allows all the classifiers to use the `LLDF` data, dimension-reduced `PCA` data, or any other type of data as long as it follows the `BaseDataModel` template.
 
+
 ## File import and export
 
 All the data models (`BaseDataModel`, and its derived, `LLDFDataModel` and `PCADataModel`) can export their content to Excel tables.
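
After this commit, the per-class training methods mentioned above (`.knn()`, `.lda()`, and so on) are unified: any `BaseActionClass` child is trained by calling `train()`, and anything that follows the `BaseDataModel` template can be handed to it. A brief sketch, assuming the settings classes can be constructed with defaults and that `fused_data` was produced elsewhere (for example by `DF.fuse()`):

```python
from chemfusekit.pca import PCASettings, PCA
from chemfusekit.knn import KNNSettings, KNN

pca = PCA(PCASettings(), fused_data)   # fused_data: any BaseDataModel-compatible object
pca.train()                            # previously pca.pca()

reduced = pca.rescaled_data            # PCA-rescaled data, still a BaseDataModel
knn = KNN(KNNSettings(), reduced)      # classifiers accept any BaseDataModel
knn.train()                            # previously knn.knn()
```
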
File renamed without changes.
