Skip to content

Commit

Permalink
Merge pull request #38 from f-aguzzi/pre/beta
Browse files Browse the repository at this point in the history
Pre/beta into main: release 2.1.0
  • Loading branch information
f-aguzzi authored Jun 7, 2024
2 parents 3fdd0b8 + 3601750 commit 56ffd1e
Show file tree
Hide file tree
Showing 60 changed files with 1,723 additions and 16 deletions.
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
## [2.1.0-beta.1](https://github.com/f-aguzzi/tesi/compare/v2.0.0...v2.1.0-beta.1) (2024-06-05)


### Features

* **import/export:** fully functional file dumping ([e1d0044](https://github.com/f-aguzzi/tesi/commit/e1d004448afd86f4ffa2ed4b87629e6798ef41b2))


### chore

* **license:** add GPLv3 license ([3fdd0b8](https://github.com/f-aguzzi/tesi/commit/3fdd0b87b6587b7413dd36f5101d37a5d712e7d7))

## [2.0.0](https://github.com/f-aguzzi/tesi/compare/v1.2.0...v2.0.0) (2024-06-04)


Expand Down
45 changes: 43 additions & 2 deletions chemfusekit/__base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,56 @@
import pandas as pd
import numpy as np
import joblib

from sklearn.base import BaseEstimator

from .__utils import GraphMode


class BaseDataModel:
    '''Models the output data from data-outputting operations.

    Holds three pieces of data, stored exactly as passed to the constructor:
    `x_data`, `x_train` and `y`. When built through `load_from_file`,
    `x_data` is every column after the first one, `y` is the content of the
    `Substance` column, and `x_train` is `Substance` followed by `x_data`.
    '''

    def __init__(self, x_data: pd.DataFrame, x_train: pd.DataFrame, y: np.ndarray):
        self.x_data = x_data
        self.x_train = x_train
        self.y = y

    @classmethod
    def load_from_file(cls, import_path: str, sheet_name: str = 'Sheet1'):
        '''Build a data model from an Excel sheet.

        The first spreadsheet column is used as the row index and the first
        row as the header. The sheet must contain a `Substance` column, and
        the first data column is left out of `x_data`.

        Args:
            import_path: path of the Excel file to read.
            sheet_name: name of the sheet to read.

        Returns:
            A new instance of `cls`.

        Raises:
            FileNotFoundError: if the sheet cannot be read for any reason.
            KeyError: if the sheet has no `Substance` column.
        '''
        try:
            table_data = pd.read_excel(
                import_path,
                sheet_name=sheet_name,
                index_col=0,
                header=0
            )
        except Exception as exc:
            raise FileNotFoundError("Error opening the selected files.") from exc

        # Work on a copy so that renaming the columns never touches table_data.
        x = table_data.iloc[:, 1:].copy()

        # Column names must be strings to be selectable by label later on.
        x.columns = x.columns.astype(str)

        y = table_data.loc[:, 'Substance'].values

        # The label frame must share the index of `x`: concat aligns rows on
        # the index, so a fresh 0..n-1 index would misalign every file whose
        # index column is not 0..n-1 and pad the result with NaN rows.
        y_dataframe = pd.DataFrame(y, columns=['Substance'], index=x.index)
        x_train = pd.concat(
            [y_dataframe, x],
            axis=1
        )

        return cls(x, x_train, y)

    def export_to_file(self, export_path: str, sheet_name: str = 'Sheet1'):
        '''Write `x_train` to an Excel file.

        Args:
            export_path: destination path of the Excel file.
            sheet_name: name of the sheet to write.

        Raises:
            RuntimeError: if the table cannot be written for any reason.
        '''
        try:
            self.x_train.to_excel(excel_writer=export_path, sheet_name=sheet_name)
        except Exception as exc:
            raise RuntimeError("Could not export data to the selected path.") from exc


class BaseSettings:
'''Holds the settings for the BaseClassifier object.'''

def __init__(self, output: GraphMode = GraphMode.NONE, test_split: bool = False):
if test_split is True and output is GraphMode.NONE:
raise Warning(
Expand All @@ -27,13 +64,17 @@ def __init__(self, output: GraphMode = GraphMode.NONE, test_split: bool = False)

class BaseClassifier:
'''Parent class for all classifiers, containing basic shared utilities.'''

def __init__(self, settings: BaseSettings, data: BaseDataModel):
    '''Store the settings and the data; the classifier starts without a model.

    Args:
        settings: settings object kept in `self.settings`.
        data: data model kept in `self.data`.
    '''
    self.settings = settings
    self.data = data
    # Single, annotated initialisation: `model` stays None until
    # `import_model` (or a subclass) assigns an estimator to it.
    self.model: BaseEstimator | None = None

def import_model(self, import_path: str):
    '''Load a serialized sklearn estimator from file into `self.model`.

    `self.model` is only replaced when the loaded object passes the type
    check, so a failed import leaves the current model untouched.

    Args:
        import_path: path of the file to load with `joblib.load`.

    Raises:
        ImportError: if the loaded object is not a `BaseEstimator`.
    '''
    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code. Only import files from trusted sources.
    model = joblib.load(import_path)
    if not isinstance(model, BaseEstimator):
        raise ImportError("The file you tried importing is not a sklearn model!")
    self.model = model

def export_model(self, export_path: str):
if self.model is not None:
Expand Down
12 changes: 12 additions & 0 deletions chemfusekit/knn.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
'''k-Nearest Neighbors Analysis module'''
from copy import copy
from typing import Optional
from beartype.typing import Callable

Expand Down Expand Up @@ -79,3 +80,14 @@ def knn(self):
algorithm=self.settings.algorithm
)
run_split_test(self.data.x_data, self.data.y, knn_split)

def import_model(self, import_path: str):
    '''Import a k-NN model from file and mirror its parameters in the settings.

    Loading is delegated to the parent class's `import_model`. A shallow copy
    of the current model is taken first; if the loaded object is not a
    `KNeighborsClassifier`, that copy is put back and `ImportError` is raised.

    Args:
        import_path: path of the model file to import.

    Raises:
        ImportError: if the imported object is not a `KNeighborsClassifier`.
    '''
    previous_model = copy(self.model)
    super().import_model(import_path)

    if isinstance(self.model, KNeighborsClassifier):
        loaded = self.model
        # Keep the settings in sync with the parameters of the imported model.
        self.settings.n_neighbors = loaded.n_neighbors
        self.settings.metric = loaded.metric
        self.settings.weights = loaded.weights
        self.settings.algorithm = loaded.algorithm
        return

    # Wrong estimator type: roll back before reporting the error.
    self.model = previous_model
    raise ImportError("The file you tried to import is not a KNeighborsClassifier.")
10 changes: 10 additions & 0 deletions chemfusekit/lda.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
'''Linear Discriminant Analysis module'''
from copy import copy
from typing import Optional

import joblib
import numpy as np
import pandas as pd

Expand Down Expand Up @@ -108,3 +110,11 @@ def lda(self):
LD(n_components=self.settings.components),
mode=self.settings.output
)

def import_model(self, import_path: str):
    '''Import an LDA model from file and mirror its component count in the settings.

    Loading is delegated to the parent class's `import_model`. A shallow copy
    of the current model is taken first; if the loaded object is not an `LD`
    instance, that copy is put back and `ImportError` is raised.

    Args:
        import_path: path of the model file to import.

    Raises:
        ImportError: if the imported object is not an `LD` instance.
    '''
    previous_model = copy(self.model)
    super().import_model(import_path)

    if isinstance(self.model, LD):
        # Keep the settings in sync with the imported model.
        self.settings.components = self.model.n_components
        return

    # Wrong estimator type: roll back before reporting the error.
    self.model = previous_model
    raise ImportError("The file you tried to import is not a LinearDiscriminantAnalysis classifier.")
9 changes: 2 additions & 7 deletions chemfusekit/lldf.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,14 +139,9 @@ def lldf(self):

self.fused_data = LLDFDataModel(x_data, x_train, y)

def export_data(self, export_path: str, sheet_name: str = 'Sheet1'):
    '''Exports the data fusion artifacts to a file.

    The writing itself is delegated to `export_to_file` on the fused data
    model, so every data model is exported through the same code path.

    Args:
        export_path: destination path of the exported file.
        sheet_name: name of the sheet to write.

    Raises:
        RuntimeError: if called while `self.fused_data` is still None.
    '''
    if self.fused_data is None:
        raise RuntimeError("Cannot export data before data fusion.")

    self.fused_data.export_to_file(export_path=export_path, sheet_name=sheet_name)
9 changes: 9 additions & 0 deletions chemfusekit/lr.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
'''Logistic Regression Module'''
from copy import copy
from typing import Optional

import numpy as np
Expand Down Expand Up @@ -128,3 +129,11 @@ def predict(self, x_sample: pd.DataFrame):
)

return prediction

def import_model(self, import_path: str):
    '''Import a logistic regression model from file and mirror its solver in the settings.

    Loading is delegated to the parent class's `import_model`. A shallow copy
    of the current model is taken first; if the loaded object is not a
    `LogisticRegression`, that copy is put back and `ImportError` is raised.

    Args:
        import_path: path of the model file to import.

    Raises:
        ImportError: if the imported object is not a `LogisticRegression`.
    '''
    previous_model = copy(self.model)
    super().import_model(import_path)

    if isinstance(self.model, LogisticRegression):
        # The settings call it `algorithm`; sklearn calls it `solver`.
        self.settings.algorithm = self.model.solver
        return

    # Wrong estimator type: roll back before reporting the error.
    self.model = previous_model
    raise ImportError("The file you tried to import is not a LogisticRegression classifier.")
8 changes: 8 additions & 0 deletions chemfusekit/plsda.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,11 @@ def plsda(self):
x = self.data.x_data
y = self.data.x_train.Substance.astype('category').cat.codes
run_split_test(x, y, PLSR(self.settings.n_components), mode=self.settings.output)

def import_model(self, import_path: str):
    '''Import a PLS model from file and mirror its component count in the settings.

    Loading is delegated to the parent class's `import_model`. A shallow copy
    of the current model is taken first; if the loaded object is not a `PLSR`
    instance, that copy is put back and `ImportError` is raised.

    Args:
        import_path: path of the model file to import.

    Raises:
        ImportError: if the imported object is not a `PLSR` instance.
    '''
    # Imported here so the method does not depend on a module-level
    # `copy` import being present in this file.
    from copy import copy

    model_backup = copy(self.model)
    super().import_model(import_path)
    if not isinstance(self.model, PLSR):
        # Wrong estimator type: roll back before reporting the error.
        self.model = model_backup
        raise ImportError("The file you tried to import is not a PLSRegression classifier.")
    # Keep the settings in sync with the imported model.
    self.settings.n_components = self.model.n_components
9 changes: 9 additions & 0 deletions chemfusekit/svm.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
'''Support Vector Machine module.'''
from copy import copy
from typing import Optional

import pandas as pd
Expand Down Expand Up @@ -60,3 +61,11 @@ def svm(self):
model=SVC(kernel=self.settings.kernel),
mode=self.settings.output
)

def import_model(self, import_path: str):
    '''Import an SVM model from file and mirror its kernel in the settings.

    Loading is delegated to the parent class's `import_model`. A shallow copy
    of the current model is taken first; if the loaded object is not an
    `SVC`, that copy is put back and `ImportError` is raised.

    Args:
        import_path: path of the model file to import.

    Raises:
        ImportError: if the imported object is not an `SVC`.
    '''
    previous_model = copy(self.model)
    super().import_model(import_path)

    if isinstance(self.model, SVC):
        # Keep the settings in sync with the imported model.
        self.settings.kernel = self.model.kernel
        return

    # Wrong estimator type: roll back before reporting the error.
    self.model = previous_model
    raise ImportError("The file you tried to import is not an SVC classifier.")
48 changes: 43 additions & 5 deletions docs/cookbook/structure.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,36 @@ As you can see, each module contains a class with the same name of the module, a

## Modular design features

The entire library was streamlined to make operations as smooth and easy as possible. Any operation (import and export of both data and classifier models, training, processing, prediction, ...) looks the same on any class.

<br />

> *Want to update the settings in a classifier?*
You'll find the settings for `LDA` in `LDA.settings`. And the settings of `PCA` in `PCA.settings`. Where are the settings for `SVM`? In `SVM.settings`, of course. You get the hang of it.

<br />

> *Want to inspect the underlying `sklearn` model in one of the classifiers?*
Let's say you're using an `LR` object. Its underlying sklearn classifier is in `LR.model`, just as the underlying sklearn classifier of `KNN` is in `KNN.model`.

<br />

> *Want to swap out the data in a model and retrain it?*
Let's assume your new data is called `new_data`. Knowing that the training data, when present, is located in the `.data` field, just do this:

```python
knn.data = new_data
knn.knn()
```

The training method is always called like its container class, but in lower case. To train a `KNN` model, like in this case, you just have to call `.knn()` on it. Same goes for `.lda()` on `LDA`, `.lldf()` on `LLDF`, and so on.


### Modular settings

The settings for all classifiers (that is, all classes except `LLDF` and `PCA`) inherit from a base class called [`BaseSettings`](/docs/base/basesettings) in the `base` module:

```mermaid
Expand Down Expand Up @@ -84,8 +114,9 @@ classDiagram
BaseSettings *-- SVMSettings
```

\
\

### Modular classifiers

The classifiers themselves all inherit from a base class called [`BaseClassifier`](/docs/base/baseclassifier) in the `base` module:

```mermaid
Expand Down Expand Up @@ -128,8 +159,9 @@ classDiagram
BaseClassifier *-- SVM
```

\
\

### Modular data types

The data types are modular and interchangeable too. Both [`LLDFDataModel`](/docs/lldf/lldfmodel) and [`PCADataModel`](/docs/pca/pcadatamodel) inherit from [`BaseDataModel`](/docs/base/basedatamodel) as shown in the following diagram:

```mermaid
Expand All @@ -156,4 +188,10 @@ classDiagram
BaseDataModel *-- PCADataModel
```

This allows all the classifiers to use the `LLDF` data, dimension-reduced `PCA` data, or any other type of data as long as it follows the `BaseDataModel` template.
This allows all the classifiers to use the `LLDF` data, dimension-reduced `PCA` data, or any other type of data as long as it follows the `BaseDataModel` template.

## File import and export

All the data models (`BaseDataModel` and its derived classes, `LLDFDataModel` and `PCADataModel`) can export their content to Excel tables.

All classifiers derived from `BaseClassifier` (`KNN`, `LDA`, `LR`, `PLSDA`, `SVM`) can import and export their underlying sklearn model from and to a file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
sidebar_position: 3
---

# Case study: training a classifier from lab data

:::note
This case study is still **under construction**.
:::
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
sidebar_position: 4
---

# Case study: hybrid workflow

:::note
This case study is still **under construction**.
:::
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
sidebar_position: 5
---

# Case study: real-time data classification

:::note
This case study is still **under construction**.
:::
26 changes: 26 additions & 0 deletions docs/cookbook_versioned_docs/version-2.1.0/introduction.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
---
sidebar_position: 1
---

# The ChemFuseKit Cookbook: an introduction

*What is a cookbook, exactly?*

> A cookbook is a comprehensive collection of recipes that guide users through
the process of learning and mastering the use of a specific library or
programming technique, by providing step-by-step instructions, explanations and
examples.

## What you'll learn

In this cookbook you will learn the basic principles of operation of `ChemFuseKit` through practical examples and case studies. You will be shown that all modules follow a basic structure, and once you've learned it for one module, you will be able to reapply that knowledge for all modules.

You will be shown how to use the library on its own, and also how to use it as a part of a bigger pipeline.

## Cookbook sectioning

Here we go:

- first of all, you will be shown the basic principles and structure;
- then, you will be shown three case studies;
- finally, you'll receive instructions on how to modify and expand this library for your own purposes.
Loading

0 comments on commit 56ffd1e

Please sign in to comment.