Skip to content

Commit

Permalink
feat(DF): mid-level data fusion support
Browse files Browse the repository at this point in the history
  • Loading branch information
f-aguzzi committed Jun 14, 2024
1 parent cf67a1a commit 45a3ea4
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 16 deletions.
52 changes: 43 additions & 9 deletions chemfusekit/df.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,28 @@
import matplotlib.pyplot as plt

from .__base import GraphMode, BaseDataModel, BaseSettings
from .pca import PCASettings, PCA
from .plsda import PLSDASettings, PLSDA


class Table:
"""Holds the path, preprocessing choice and sheet name for a single Excel table."""
def __init__(self, file_path: str, sheet_name: str, preprocessing: str, class_column: str = 'Substance',
index_column: str | None = None):
def __init__(self, file_path: str, sheet_name: str, preprocessing: str, feature_selection: str | None = None,
class_column: str = 'Substance', index_column: str | None = None):
self.file_path = file_path
self.sheet_name = sheet_name
self.preprocessing = preprocessing
self.feature_selection = feature_selection
self.class_column = class_column
self.index_column = index_column


class DFDataModel(BaseDataModel):
"""Models the output data from the DF operation"""
def __init__(self, x_data: pd.DataFrame, x_train: pd.DataFrame, y: np.ndarray):
def __init__(self, x_data: pd.DataFrame, x_train: pd.DataFrame, y: np.ndarray,
tables: list[(Table, BaseDataModel)]):
super().__init__(x_data, x_train, y)
self.tables = tables


class DFSettings(BaseSettings):
Expand Down Expand Up @@ -57,6 +62,7 @@ def __init__(self, settings: DFSettings, tables: List[Table]):
def fuse(self):
"""Performs data fusion"""
x_vector = []
individual_tables = []
for table in self.tables:
try:
# Autodetect the format based on the file extension
Expand Down Expand Up @@ -85,14 +91,16 @@ def fuse(self):

# select only numerical attributes
if table.index_column is not None and table.index_column in table_data.columns:
x = table_data.drop(table.index_column, axis=1)
deindexed_table = table_data.drop(table.index_column, axis=1)
else:
x = table_data.iloc[:, 1:]
deindexed_table = table_data

if table.class_column in table_data.columns:
x = table_data.drop(table.class_column, axis=1)
x = deindexed_table.drop(table.class_column, axis=1)
y = deindexed_table[table.class_column]
else:
x = x.iloc[:, 1:]
x = deindexed_table.iloc[:, 1:]
y = deindexed_table[:, 1]

# It is necessary to convert the column names as string to select them
x.columns = x.columns.astype(str) # to make the colnames as text
Expand All @@ -116,6 +124,31 @@ def fuse(self):
raise SyntaxError(
f"DF: this type of preprocessing does not exist ({table.preprocessing=})"
)
preprocessed_x = pd.DataFrame(preprocessed_x)

# Save the temporary table as a BaseDataModel
x_train = pd.concat([y, preprocessed_x], axis=1)
table_data_model = BaseDataModel(
x_data=preprocessed_x,
x_train=x_train,
y=np.asarray(y)
)

# Feature reduction
if table.feature_selection is None:
reduced_table_data_model = table_data_model
elif table.feature_selection == 'pca':
pca = PCA(PCASettings(), table_data_model)
pca.train()
reduced_table_data_model = pca.reduce(table_data_model)
elif table.feature_selection == 'plsda':
plsda = PLSDA(PLSDASettings(), table_data_model)
plsda.train()
reduced_table_data_model = plsda.reduce(table_data_model)
else:
raise SyntaxError(
f"DF: this type of feature selection does not exist ({table.feature_selection=})"
)

if self.settings.output is GraphMode.GRAPHIC:
numbers_string = [str(col) for col in x.columns]
Expand Down Expand Up @@ -148,14 +181,15 @@ def fuse(self):

# Create a new DataFrame with the processed numerical attributes
processed_dataframe_x = pd.DataFrame(
preprocessed_x,
reduced_table_data_model.x_data,
columns=x.columns
)

# Reset the index of the dataframe
processed_dataframe_x = processed_dataframe_x.reset_index(drop=True)

x_vector.append(processed_dataframe_x)
individual_tables.append((table, table_data_model))

try:
table_data = pd.read_excel(
Expand All @@ -182,7 +216,7 @@ def fuse(self):
axis=1
)

self.fused_data = DFDataModel(x_data, x_train, y)
self.fused_data = DFDataModel(x_data, x_train, y, individual_tables)

def export_data(self, export_path: str, sheet_name: str = 'Sheet1'):
"""Exports the data fusion artifacts to a file"""
Expand Down
5 changes: 4 additions & 1 deletion docs/docs/df/dfmodel.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,12 @@ The first two are `Pandas` `DataFrame` objects:
- `x_data`
- `x_train`

The last is a `NumPy` `ndarray`:
The third is a `NumPy` `ndarray`:
- `y`

The last is a `list` of `tuple`s, each pairing a `Table` with the `BaseDataModel` that holds its preprocessed data (as it was before any feature selection), representing the individual imported tables:
- `tables`

## Methods

Both methods are inherited from [`BaseDataModel`](../base/basedatamodel.md):
Expand Down
15 changes: 9 additions & 6 deletions docs/docs/df/table.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,11 @@ The [`DF`](./df-class.md) object takes a list of `Table` as a parameter.

```python
Table(
file_path: str
sheet_name: str
preprocessing: str
class_column: str
file_path: str,
sheet_name: str,
preprocessing: str,
feature_selection: str | None,
class_column: str,
index_column: str | None
)
```
Expand All @@ -26,8 +27,10 @@ Table(
- `sheet_name`: a `str` containing the name of the sheet to select within the Excel file
- `preprocessing`: a `str` with the name of the preprocessing to be applied to the table.
Available options: `snv` (Standard Normal Variate normalization), `savgol` (Savitzky-Golay smoothing), `savgol+snv` (both), `none` (no processing).
- 'class_column': a 'str' indicating the name of the class column within the Excel datasheet. Defaults to 'Substance'.
- 'index_column': a 'str' | `None` indicating the name of the index column within the Excel datasheet. Defaults to `None` (and in that case, the first column will be treated as the index).
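- `feature_selection`: a `str` | `None` indicating the name of the feature extraction technique to be applied to the table, for the purpose of mid-level data fusion.
Available options: `pca` (Principal Component Analysis), `plsda` (Partial Least Squares Discriminant Analysis), `None` (no feature extraction). Defaults to `None`. Note that, unlike `preprocessing`, the string `'none'` is not accepted here: any value other than `None`, `pca` or `plsda` raises an error.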
- `class_column`: a `str` indicating the name of the class column within the Excel datasheet. Defaults to `Substance`.
- `index_column`: a `str` | `None` indicating the name of the index column within the Excel datasheet. Defaults to `None` (and in that case, the first column will be treated as the index).

## Example

Expand Down
19 changes: 19 additions & 0 deletions tests/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,25 @@ def test_export(self):
with self.assertRaises(ValueError):
df.export_data('$£=0\//|')

def test_midlevel_data_fusion(self):
"""Integration test case for mid-level data fusion."""
df_settings = DFSettings(output=GraphMode.GRAPHIC)
table1 = Table(
file_path='tests/qepas.xlsx',
sheet_name='Sheet1',
preprocessing='snv',
feature_selection='pca'
)
table2 = Table(
file_path='tests/rt.xlsx',
sheet_name='Sheet1',
preprocessing='none'
)

tables = [table1, table2]

df = DF(df_settings, tables)
df.fuse()

if __name__ == '__main__':
unittest.main()

0 comments on commit 45a3ea4

Please sign in to comment.