feat(DF): mid-level data fusion support

f-aguzzi · Jun 14, 2024 · 45a3ea4 · 45a3ea4
1 parent cf67a1a
commit 45a3ea4
Show file tree

Hide file tree

Showing 4 changed files with 75 additions and 16 deletions.
diff --git a/chemfusekit/df.py b/chemfusekit/df.py
@@ -8,23 +8,28 @@
 import matplotlib.pyplot as plt
 
 from .__base import GraphMode, BaseDataModel, BaseSettings
+from .pca import PCASettings, PCA
+from .plsda import PLSDASettings, PLSDA
 
 
 class Table:
     """Holds the path, preprocessing choice and sheet name for a single Excel table."""
-    def __init__(self, file_path: str, sheet_name: str, preprocessing: str, class_column: str = 'Substance',
-                 index_column: str | None = None):
+    def __init__(self, file_path: str, sheet_name: str, preprocessing: str, feature_selection: str | None = None,
+                 class_column: str = 'Substance', index_column: str | None = None):
         self.file_path = file_path
         self.sheet_name = sheet_name
         self.preprocessing = preprocessing
+        self.feature_selection = feature_selection
         self.class_column = class_column
         self.index_column = index_column
 
 
 class DFDataModel(BaseDataModel):
     """Models the output data from the DF operation"""
-    def __init__(self, x_data: pd.DataFrame, x_train: pd.DataFrame, y: np.ndarray):
+    def __init__(self, x_data: pd.DataFrame, x_train: pd.DataFrame, y: np.ndarray,
+                 tables: list[(Table, BaseDataModel)]):
         super().__init__(x_data, x_train, y)
+        self.tables = tables
 
 
 class DFSettings(BaseSettings):
@@ -57,6 +62,7 @@ def __init__(self, settings: DFSettings, tables: List[Table]):
     def fuse(self):
         """Performs data fusion"""
         x_vector = []
+        individual_tables = []
         for table in self.tables:
             try:
                 # Autodetect the format based on the file extension
@@ -85,14 +91,16 @@ def fuse(self):
 
             # select only numerical attributes
             if table.index_column is not None and table.index_column in table_data.columns:
-                x = table_data.drop(table.index_column, axis=1)
+                deindexed_table = table_data.drop(table.index_column, axis=1)
             else:
-                x = table_data.iloc[:, 1:]
+                deindexed_table = table_data
 
             if table.class_column in table_data.columns:
-                x = table_data.drop(table.class_column, axis=1)
+                x = deindexed_table.drop(table.class_column, axis=1)
+                y = deindexed_table[table.class_column]
             else:
-                x = x.iloc[:, 1:]
+                x = deindexed_table.iloc[:, 1:]
+                y = deindexed_table[:, 1]
 
             # It is necessary to convert the column names as string to select them
             x.columns = x.columns.astype(str)     # to make the colnames as text
@@ -116,6 +124,31 @@ def fuse(self):
                 raise SyntaxError(
                     f"DF: this type of preprocessing does not exist ({table.preprocessing=})"
                 )
+            preprocessed_x = pd.DataFrame(preprocessed_x)
+
+            # Save the temporary table as a BaseDataModel
+            x_train = pd.concat([y, preprocessed_x], axis=1)
+            table_data_model = BaseDataModel(
+                x_data=preprocessed_x,
+                x_train=x_train,
+                y=np.asarray(y)
+            )
+
+            # Feature reduction
+            if table.feature_selection is None:
+                reduced_table_data_model = table_data_model
+            elif table.feature_selection == 'pca':
+                pca = PCA(PCASettings(), table_data_model)
+                pca.train()
+                reduced_table_data_model = pca.reduce(table_data_model)
+            elif table.feature_selection == 'plsda':
+                plsda = PLSDA(PLSDASettings(), table_data_model)
+                plsda.train()
+                reduced_table_data_model = plsda.reduce(table_data_model)
+            else:
+                raise SyntaxError(
+                    f"DF: this type of feature selection does not exist ({table.feature_selection=})"
+                )
 
             if self.settings.output is GraphMode.GRAPHIC:
                 numbers_string = [str(col) for col in x.columns]
@@ -148,14 +181,15 @@ def fuse(self):
 
             # Create a new DataFrame with the processed numerical attributes
             processed_dataframe_x = pd.DataFrame(
-                preprocessed_x,
+                reduced_table_data_model.x_data,
                 columns=x.columns
             )
 
             # Reset the index of the dataframe
             processed_dataframe_x = processed_dataframe_x.reset_index(drop=True)
 
             x_vector.append(processed_dataframe_x)
+            individual_tables.append((table, table_data_model))
 
         try:
             table_data = pd.read_excel(
@@ -182,7 +216,7 @@ def fuse(self):
             axis=1
         )
 
-        self.fused_data = DFDataModel(x_data, x_train, y)
+        self.fused_data = DFDataModel(x_data, x_train, y, individual_tables)
 
     def export_data(self, export_path: str, sheet_name: str = 'Sheet1'):
         """Exports the data fusion artifacts to a file"""

diff --git a/docs/docs/df/dfmodel.md b/docs/docs/df/dfmodel.md
@@ -20,9 +20,12 @@ The first two are `Pandas` `DataFrame` objects:
 - `x_data`
 - `x_train`
 
-The last is a `NumPy` `ndarray`:
+The third is a `NumPy` `ndarray`:
 - `y`
 
+The last is a `list` of `tuple`s containing a `Table` and a `BaseDataModel`, representing the individual imported tables:
+- `tables`
+
 ## Methods
 
 Both methods are inherited from [`BaseDataModel`](../base/basedatamodel.md):

diff --git a/docs/docs/df/table.md b/docs/docs/df/table.md
@@ -12,10 +12,11 @@ The [`DF`](./df-class.md) object takes a list of `Table` as a parameter.
 
 ```python
 Table(
-    file_path: str
-    sheet_name: str
-    preprocessing: str
-    class_column: str
+    file_path: str,
+    sheet_name: str,
+    preprocessing: str,
+    feature_selection: str | None,
+    class_column: str,
     index_column: str | None
 )
 ```
@@ -26,8 +27,10 @@ Table(
 - `sheet_name`: a `str` containing the name of the sheet to select within the Excel file
 - `preprocessing`: a `str` with the name of the preprocessing to be applied to the table.
    Available options: `snv` (normalization), `savgol` (Savitzky-Golay smoothing), `savgol+snv` (both), `none` (no processing).
-- 'class_column': a 'str' indicating the name of the class column within the Excel datasheet. Defaults to 'Substance'.
-- 'index_column': a 'str' | `None` indicating the name of the index column within the Excel datasheet. Defaults to `None` (and in that case, the first column will be treated as the index).
+- `feature_selection`: a `str` | `None` indicating the name of the feature extraction technique to be applied to the table, for the purpose of mid-level data fusion.
+   Available options: `pca` (Principal Component Analysis), `plsda` (Partial Least Squares Discriminant Analysis), `None` (no feature extraction). Defaults to `None`.
+- `class_column`: a `str` indicating the name of the class column within the Excel datasheet. Defaults to `Substance`.
+- `index_column`: a `str` | `None` indicating the name of the index column within the Excel datasheet. Defaults to `None` (and in that case, the first column will be treated as the index).
 
 ## Example
 

diff --git a/tests/test_df.py b/tests/test_df.py
@@ -70,6 +70,25 @@ def test_export(self):
         with self.assertRaises(ValueError):
             df.export_data('$£=0\//|')
 
+    def test_midlevel_data_fusion(self):
+        """Integration test case for mid-level data fusion."""
+        df_settings = DFSettings(output=GraphMode.GRAPHIC)
+        table1 = Table(
+            file_path='tests/qepas.xlsx',
+            sheet_name='Sheet1',
+            preprocessing='snv',
+            feature_selection='pca'
+        )
+        table2 = Table(
+            file_path='tests/rt.xlsx',
+            sheet_name='Sheet1',
+            preprocessing='none'
+        )
+
+        tables = [table1, table2]
+
+        df = DF(df_settings, tables)
+        df.fuse()
 
 if __name__ == '__main__':
     unittest.main()