Merge pull request #38 from f-aguzzi/pre/beta

Pre/beta into main: release 2.1.0
f-aguzzi · Jun 7, 2024 · 56ffd1e · 56ffd1e
2 parents 3fdd0b8 + 3601750
commit 56ffd1e
Show file tree

Hide file tree

Showing 60 changed files with 1,723 additions and 16 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,15 @@
+## [2.1.0-beta.1](https://github.com/f-aguzzi/tesi/compare/v2.0.0...v2.1.0-beta.1) (2024-06-05)
+
+
+### Features
+
+* **import/export:** fully functional file dumping ([e1d0044](https://github.com/f-aguzzi/tesi/commit/e1d004448afd86f4ffa2ed4b87629e6798ef41b2))
+
+
+### chore
+
+* **license:** add GPLv3 license ([3fdd0b8](https://github.com/f-aguzzi/tesi/commit/3fdd0b87b6587b7413dd36f5101d37a5d712e7d7))
+
 ## [2.0.0](https://github.com/f-aguzzi/tesi/compare/v1.2.0...v2.0.0) (2024-06-04)
 
 

diff --git a/chemfusekit/__base.py b/chemfusekit/__base.py
@@ -3,19 +3,56 @@
 import pandas as pd
 import numpy as np
 import joblib
+
+from sklearn.base import BaseEstimator
+
 from .__utils import GraphMode
 
 
 class BaseDataModel:
     '''Models the output data from data-outputting operations'''
+
     def __init__(self, x_data: pd.DataFrame, x_train: pd.DataFrame, y: np.ndarray):
         self.x_data = x_data
         self.x_train = x_train
         self.y = y
 
+    @classmethod
+    def load_from_file(cls, import_path: str, sheet_name: str = 'Sheet1'):
+        try:
+            table_data = pd.read_excel(
+                import_path,
+                sheet_name=sheet_name,
+                index_col=0,
+                header=0
+            )
+        except Exception as exc:
+            raise FileNotFoundError("Error opening the selected files.") from exc
+
+        x = table_data.iloc[:, 1:]
+
+        # It is necessary to convert the column names as string to select them
+        x.columns = x.columns.astype(str)  # to make the colnames as text
+
+        y = table_data.loc[:, 'Substance'].values
+        y_dataframe = pd.DataFrame(y, columns=['Substance'])
+        x_train = pd.concat(
+            [y_dataframe, x],
+            axis=1
+        )
+
+        return cls(x, x_train, y)
+
+    def export_to_file(self, export_path: str, sheet_name: str = 'Sheet1'):
+        try:
+            self.x_train.to_excel(excel_writer=export_path, sheet_name=sheet_name)
+        except Exception as exc:
+            raise RuntimeError("Could not export data to the selected path.") from exc
+
 
 class BaseSettings:
     '''Holds the settings for the BaseClassifier object.'''
+
     def __init__(self, output: GraphMode = GraphMode.NONE, test_split: bool = False):
         if test_split is True and output is GraphMode.NONE:
             raise Warning(
@@ -27,13 +64,17 @@ def __init__(self, output: GraphMode = GraphMode.NONE, test_split: bool = False)
 
 class BaseClassifier:
     '''Parent class for all classifiers, containing basic shared utilities.'''
+
     def __init__(self, settings: BaseSettings, data: BaseDataModel):
         self.settings = settings
         self.data = data
-        self.model = None
+        self.model: BaseEstimator | None = None
 
     def import_model(self, import_path: str):
-        joblib.load(self.model, import_path)
+        model = joblib.load(import_path)
+        if not isinstance(model, BaseEstimator):
+            raise ImportError("The file you tried importing is not a sklearn model!")
+        self.model = model
 
     def export_model(self, export_path: str):
         if self.model is not None:

diff --git a/chemfusekit/knn.py b/chemfusekit/knn.py
@@ -1,4 +1,5 @@
 '''k-Nearest Neighbors Analysis module'''
+from copy import copy
 from typing import Optional
 from beartype.typing import Callable
 
@@ -79,3 +80,14 @@ def knn(self):
                 algorithm=self.settings.algorithm
             )
             run_split_test(self.data.x_data, self.data.y, knn_split)
+
+    def import_model(self, import_path: str):
+        model_backup = copy(self.model)
+        super().import_model(import_path)
+        if not isinstance(self.model, KNeighborsClassifier):
+            self.model = model_backup
+            raise ImportError("The file you tried to import is not a KNeighborsClassifier.")
+        self.settings.n_neighbors = self.model.n_neighbors
+        self.settings.metric = self.model.metric
+        self.settings.weights = self.model.weights
+        self.settings.algorithm = self.model.algorithm
diff --git a/chemfusekit/lda.py b/chemfusekit/lda.py
@@ -1,6 +1,8 @@
 '''Linear Discriminant Analysis module'''
+from copy import copy
 from typing import Optional
 
+import joblib
 import numpy as np
 import pandas as pd
 
@@ -108,3 +110,11 @@ def lda(self):
                 LD(n_components=self.settings.components),
                 mode=self.settings.output
             )
+
+    def import_model(self, import_path: str):
+        model_backup = copy(self.model)
+        super().import_model(import_path)
+        if not isinstance(self.model, LD):
+            self.model = model_backup
+            raise ImportError("The file you tried to import is not a LinearDiscriminantAnalysis classifier.")
+        self.settings.components = self.model.n_components
diff --git a/chemfusekit/lldf.py b/chemfusekit/lldf.py
@@ -139,14 +139,9 @@ def lldf(self):
 
         self.fused_data = LLDFDataModel(x_data, x_train, y)
 
-    def export_data(self, export_path: str):
+    def export_data(self, export_path: str, sheet_name: str = 'Sheet1'):
         '''Exports the data fusion artifacts to a file'''
         if self.fused_data is None:
             raise RuntimeError("Cannot export data before data fusion.")
 
-        x_train_dataframe = pd.DataFrame(self.fused_data.x_train)
-
-        try:
-            x_train_dataframe.to_excel(export_path)
-        except Exception as exc:
-            raise RuntimeError("Could not export data to the selected path.") from exc
+        self.fused_data.export_to_file(export_path=export_path, sheet_name=sheet_name)
diff --git a/chemfusekit/lr.py b/chemfusekit/lr.py
@@ -1,4 +1,5 @@
 '''Logistic Regression Module'''
+from copy import copy
 from typing import Optional
 
 import numpy as np
@@ -128,3 +129,11 @@ def predict(self, x_sample: pd.DataFrame):
         )
 
         return prediction
+
+    def import_model(self, import_path: str):
+        model_backup = copy(self.model)
+        super().import_model(import_path)
+        if not isinstance(self.model, LogisticRegression):
+            self.model = model_backup
+            raise ImportError("The file you tried to import is not a LogisticRegression classifier.")
+        self.settings.algorithm = self.model.solver
diff --git a/chemfusekit/plsda.py b/chemfusekit/plsda.py
@@ -133,3 +133,11 @@ def plsda(self):
             x = self.data.x_data
             y = self.data.x_train.Substance.astype('category').cat.codes
             run_split_test(x, y, PLSR(self.settings.n_components), mode=self.settings.output)
+
+    def import_model(self, import_path: str):
+        model_backup = copy(self.model)
+        super().import_model(import_path)
+        if not isinstance(self.model, PLSR):
+            self.model = model_backup
+            raise ImportError("The file you tried to import is not a PLSRegression classifier.")
+        self.settings.n_components = self.model.n_components
diff --git a/chemfusekit/svm.py b/chemfusekit/svm.py
@@ -1,4 +1,5 @@
 '''Support Vector Machine module.'''
+from copy import copy
 from typing import Optional
 
 import pandas as pd
@@ -60,3 +61,11 @@ def svm(self):
                 model=SVC(kernel=self.settings.kernel),
                 mode=self.settings.output
             )
+
+    def import_model(self, import_path: str):
+        model_backup = copy(self.model)
+        super().import_model(import_path)
+        if not isinstance(self.model, SVC):
+            self.model = model_backup
+            raise ImportError("The file you tried to import is not an SVC classifier.")
+        self.settings.kernel = self.model.kernel
diff --git a/docs/cookbook/structure.md b/docs/cookbook/structure.md
@@ -47,6 +47,36 @@ As you can see, each module contains a class with the same name of the module, a
 
 ## Modular design features
 
+The entire library was streamlined to make operations as smooth and easy as possible. Any operation (import and export of both data and classifier models, training, processing, prediction, ...) looks the same on any class.
+
+<br />
+
+> *Want to update the settings in a classifier?*
+
+You'll find the settings for `LDA` in `LDA.settings`. And the settings of `PCA` in `PCA.settings`. Where are the settings for `SVM`? In `SVM.settings`, of course. You get the idea.
+
+<br />
+
+> *Want to inspect the underlying `sklearn` model in one of the classifiers?*
+
+Let's say you're using an `LR` object. Its underlying sklearn classifier is in `LR.model`, just as the underlying sklearn classifier of `KNN` is in `KNN.model`.
+
+<br />
+
+> *Want to swap out the data in a model and retrain it?*
+
+Let's assume your new data is called `new_data`. Knowing that the training data, when present, is located in the `.data` field, just do this:
+
+```python
+knn.data = new_data
+knn.knn()
+```
+
+The training method always has the same name as its container class, but in lower case. To train a `KNN` model, as in this case, you just have to call `.knn()` on it. The same goes for `.lda()` on `LDA`, `.lldf()` on `LLDF`, and so on.
+
+
+### Modular settings
+
 The settings for all classifiers (that is, all classes except `LLDF` and `PCA`) inherit from a base class called [`BaseSettings`](/docs/base/basesettings) in the `base` module:
 
 ```mermaid
@@ -84,8 +114,9 @@ classDiagram
     BaseSettings *-- SVMSettings 
 ```
 
-\
-\
+
+### Modular classifiers
+
 The classifiers themselves all inherit from a base class called [`BaseClassifier`](/docs/base/baseclassifier) in the `base` module:
 
 ```mermaid
@@ -128,8 +159,9 @@ classDiagram
     BaseClassifier *-- SVM
 ```
 
-\
-\
+
+### Modular data types
+
 The data types are modular and interchangeable too. Both [`LLDFDataModel`](/docs/lldf/lldfmodel) and [`PCADataModel`](/docs/pca/pcadatamodel) inherit from [`BaseDataModel`](/docs/base/basedatamodel) as shown in the following diagram:
 
 ```mermaid
@@ -156,4 +188,10 @@ classDiagram
     BaseDataModel *-- PCADataModel
 ```
 
-This allows all the classifiers to use the `LLDF` data, dimension-reduced `PCA` data, or any other type of data as long as it follows the `BaseDataModel` template.
+This allows all the classifiers to use the `LLDF` data, dimension-reduced `PCA` data, or any other type of data as long as it follows the `BaseDataModel` template.
+
+## File import and export
+
+All the data models (`BaseDataModel` and its derived classes, `LLDFDataModel` and `PCADataModel`) can export their content to Excel tables.
+
+All classifiers derived from `BaseClassifier` (`KNN`, `LDA`, `LR`, `PLSDA`, `SVM`) can import and export their underlying sklearn model from and to a file.
diff --git a/docs/cookbook_versioned_docs/version-2.1.0/case-study-classifier.md b/docs/cookbook_versioned_docs/version-2.1.0/case-study-classifier.md
@@ -0,0 +1,9 @@
+---
+sidebar_position: 3
+---
+
+# Case study: training a classifier from lab data
+
+:::note
+This case study is still **under construction**.
+:::
diff --git a/docs/cookbook_versioned_docs/version-2.1.0/case-study-hybrid.md b/docs/cookbook_versioned_docs/version-2.1.0/case-study-hybrid.md
@@ -0,0 +1,9 @@
+---
+sidebar_position: 4
+---
+
+# Case study: hybrid workflow
+
+:::note
+This case study is still **under construction**.
+:::
diff --git a/docs/cookbook_versioned_docs/version-2.1.0/case-study-realtime.md b/docs/cookbook_versioned_docs/version-2.1.0/case-study-realtime.md
@@ -0,0 +1,9 @@
+---
+sidebar_position: 5
+---
+
+# Case study: real-time data classification
+
+:::note
+This case study is still **under construction**.
+:::
diff --git a/docs/cookbook_versioned_docs/version-2.1.0/introduction.md b/docs/cookbook_versioned_docs/version-2.1.0/introduction.md
@@ -0,0 +1,26 @@
+---
+sidebar_position: 1
+---
+
+# The ChemFuseKit Cookbook: an introduction
+
+*What is a cookbook, exactly?*
+
+> A cookbook is a comprehensive collection of recipes that guide users through
+the process of learning and mastering the use of a specific library or
+programming technique, by providing step-by-step instructions, explanations and
+examples.
+
+## What you'll learn
+
+In this cookbook you will learn the basic principles of operation of `ChemFuseKit` through practical examples and case studies. You will be shown that all modules follow a basic structure, and once you've learned it for one module, you will be able to apply that knowledge to all the other modules.
+
+You will be shown how to use the library on its own, and also how to use it as a part of a bigger pipeline.
+
+## Cookbook sectioning
+
+Here we go:
+
+- first of all, you will be shown the basic principles and structure;
+- then, you will be shown three case studies;
+- finally, you'll receive instructions on how to modify and expand this library for your own purposes.