Skip to content

Commit

Permalink
Merge pull request #36 from f-aguzzi/pre/beta
Browse files Browse the repository at this point in the history
Pre/beta into main: release 2.0.0
  • Loading branch information
f-aguzzi authored Jun 4, 2024
2 parents 862ddf6 + 330ad59 commit 6896b35
Show file tree
Hide file tree
Showing 127 changed files with 2,298 additions and 295 deletions.
61 changes: 61 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,64 @@
## [2.0.0-beta.4](https://github.com/f-aguzzi/tesi/compare/v2.0.0-beta.3...v2.0.0-beta.4) (2024-06-04)


### Bug Fixes

* lda and lr_tests missing arguments ([280159d](https://github.com/f-aguzzi/tesi/commit/280159d8208f46a2a843e9eeae60d82114e15261))


### Docs

* new blog post ([afed9f7](https://github.com/f-aguzzi/tesi/commit/afed9f7620d06559892a517daef4f78192d3f3e2))

## [2.0.0-beta.3](https://github.com/f-aguzzi/tesi/compare/v2.0.0-beta.2...v2.0.0-beta.3) (2024-06-04)


### Features

* **LDA:** autodetect components from PCADataModel ([a59cd54](https://github.com/f-aguzzi/tesi/commit/a59cd545e9926de94117f2a46be5801c24271ba8))


### Docs

* fix wrong version number ([a0eb4e6](https://github.com/f-aguzzi/tesi/commit/a0eb4e6110dc25a5a8a4e6e72ff7ba02c05f6a14))
* update examples ([c919596](https://github.com/f-aguzzi/tesi/commit/c919596c94a7fd0c54548027d05c857c758054c9))


### Refactor

* moved prediction into base class ([57a3497](https://github.com/f-aguzzi/tesi/commit/57a349743964db553aa6cea425631022c37920b3))
* **lldf:** switch arguments in constructor call ([fcf7471](https://github.com/f-aguzzi/tesi/commit/fcf7471ba519ceed7747f48895421e932506b835))

## [2.0.0-beta.2](https://github.com/f-aguzzi/tesi/compare/v2.0.0-beta.1...v2.0.0-beta.2) (2024-06-04)


### Features

* made LR inherit from BaseClassifier ([d06a7db](https://github.com/f-aguzzi/tesi/commit/d06a7db270a99517a6445c94bdfacc1901e90121))


### Docs

* new version ([7c96050](https://github.com/f-aguzzi/tesi/commit/7c96050e20382fdd2312584dd0cf8ee091329181))

## [2.0.0-beta.1](https://github.com/f-aguzzi/tesi/compare/v1.2.0...v2.0.0-beta.1) (2024-06-04)


### ⚠ BREAKING CHANGES

* added base class for classifiers, models, settings

### Features

* added base class for classifiers, models, settings ([4af5d47](https://github.com/f-aguzzi/tesi/commit/4af5d4778d28021dcd2e23f00fc5810ae178769d))


### Docs

* fix broken github pages build ([862ddf6](https://github.com/f-aguzzi/tesi/commit/862ddf6557973229ec9b85830b677822db0f9da7))
* new cookbook section ([fd9a243](https://github.com/f-aguzzi/tesi/commit/fd9a2435469bdcf0457909ec3424ce1af5b118a9))
* updated docusaurus with versioning ([0b6d5c4](https://github.com/f-aguzzi/tesi/commit/0b6d5c4319f371a757ad0fc3a142e2eb1d959137)), closes [#33](https://github.com/f-aguzzi/tesi/issues/33)

## [1.2.0](https://github.com/f-aguzzi/tesi/compare/v1.1.3...v1.2.0) (2024-06-03)


Expand Down
52 changes: 52 additions & 0 deletions chemfusekit/__base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
'''Base classes shared by all data models, settings and classifiers.'''

import pandas as pd
import numpy as np
import joblib
from .__utils import GraphMode


class BaseDataModel:
    '''Container for the data produced by a data-outputting operation.

    The three objects passed to the constructor are stored as-is on the
    instance (no copy, no validation).
    '''

    def __init__(self, x_data: pd.DataFrame, x_train: pd.DataFrame, y: np.ndarray):
        '''Store the given data on the instance.

        Args:
            x_data: stored as ``self.x_data``.
            x_train: stored as ``self.x_train``.
            y: stored as ``self.y``.
        '''
        self.x_data: pd.DataFrame = x_data
        self.x_train: pd.DataFrame = x_train
        self.y: np.ndarray = y


class BaseSettings:
    '''Settings common to every classifier: output mode and split-test flag.'''

    def __init__(self, output: GraphMode = GraphMode.NONE, test_split: bool = False):
        '''Validate and store the shared settings.

        Args:
            output: the graph/output mode; defaults to ``GraphMode.NONE``.
            test_split: whether a split test was requested; defaults to False.

        Raises:
            Warning: if ``test_split`` is exactly ``True`` while ``output`` is
                ``GraphMode.NONE``. The Warning is raised as an exception,
                so construction fails in that case.
        '''
        output_disabled = output is GraphMode.NONE
        if output_disabled and test_split is True:
            raise Warning(
                "You selected test_split but it won't run because you disabled the output."
            )

        self.output = output
        self.test_split = test_split


class BaseClassifier:
    '''Parent class for all classifiers, containing basic shared utilities.

    Holds the settings, the input data and the trained model (``None`` until
    a subclass trains one or a model is imported), and provides model
    import/export and prediction.
    '''

    def __init__(self, settings: BaseSettings, data: BaseDataModel):
        '''Store the settings and data; the model starts out as ``None``.

        Args:
            settings: the settings object for this classifier.
            data: the data model the classifier operates on.
        '''
        self.settings = settings
        self.data = data
        self.model = None

    def import_model(self, import_path: str):
        '''Load a previously exported model from ``import_path`` into ``self.model``.

        Args:
            import_path: path of the file to load with ``joblib.load``.
        '''
        # NOTE(security): joblib.load relies on pickle and can execute arbitrary
        # code while loading; only import files from trusted sources.
        # The loaded object must be assigned: joblib.load takes the file path as
        # its first argument and returns the reconstructed object.
        self.model = joblib.load(import_path)

    def export_model(self, export_path: str):
        '''Dump the trained model to ``export_path`` with ``joblib.dump``.

        Args:
            export_path: path of the file to write.

        Raises:
            RuntimeError: if no model is available yet (``self.model`` is None).
        '''
        if self.model is None:
            raise RuntimeError("You haven't trained the model yet! You cannot export it now.")
        joblib.dump(self.model, export_path)

    def predict(self, x_data: pd.DataFrame):
        '''Performs prediction once the model is trained.

        Args:
            x_data: the data to pass to the model's ``predict`` method.

        Returns:
            Whatever ``self.model.predict(x_data)`` returns.

        Raises:
            TypeError: if ``x_data`` is None.
            RuntimeError: if no model is available yet (``self.model`` is None).
        '''
        if x_data is None:
            raise TypeError(f"X data for {self.__class__.__name__} prediction must be non-empty.")
        if self.model is None:
            raise RuntimeError(f"The {self.__class__.__name__} model is not trained yet!")

        return self.model.predict(x_data)
54 changes: 17 additions & 37 deletions chemfusekit/knn.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,18 @@

import pandas as pd

from chemfusekit.lldf import LLDFModel
from chemfusekit.lldf import LLDFDataModel
from chemfusekit.__utils import run_split_test, print_confusion_matrix, print_table, GraphMode
from .__base import BaseSettings, BaseClassifier

class KNNSettings:

class KNNSettings(BaseSettings):
'''Holds the settings for the kNN object.'''
def __init__(
self,
n_neighbors: int = 15,
metric: str | Callable = 'euclidean',
weights: str | Callable = 'uniform',
algorithm: str = 'auto',
output: GraphMode = GraphMode.NONE,
test_split: bool = False
):
def __init__(self, n_neighbors: int = 15, metric: str | Callable = 'euclidean', weights: str | Callable = 'uniform',
algorithm: str = 'auto', output: GraphMode = GraphMode.NONE, test_split: bool = False):

super().__init__(output, test_split)

if n_neighbors < 1:
raise ValueError("Invalid n_neighbors number: should be a positive integer.")
if metric not in ['minkwoski', 'precomputed', 'euclidean'] and not callable(metric):
Expand All @@ -32,23 +30,16 @@ def __init__(
raise ValueError(
"Invalid algorithm: should be 'auto', 'ball_tree', 'kd_tree' or 'brute'."
)
if test_split is True and output is GraphMode.NONE:
raise Warning(
"You selected test_split but it won't run because you disabled the output."
)
self.n_neighbors = n_neighbors
self.metric = metric
self.weights = weights
self.algorithm = algorithm
self.output = output
self.test_split = test_split

class KNN:

class KNN(BaseClassifier):
'''Class to store the data, methods and artifacts for k-Nearest Neighbors Analysis'''
def __init__(self, settings: KNNSettings, fused_data: LLDFModel):
self.settings = settings
self.fused_data = fused_data
self.model: Optional[KNeighborsClassifier] = None
def __init__(self, settings: KNNSettings, fused_data: LLDFDataModel):
super().__init__(settings, fused_data)

def knn(self):
'''Performs k-Nearest Neighbors Analysis'''
Expand All @@ -59,13 +50,13 @@ def knn(self):
weights=self.settings.weights,
algorithm=self.settings.algorithm
)
knn.fit(self.fused_data.x_data, self.fused_data.y)
knn.fit(self.data.x_data, self.data.y)

# Save the trained model
self.model = knn

# View the prediction on the test data
y_pred = knn.predict(self.fused_data.x_data)
y_pred = knn.predict(self.data.x_data)
print_table(
["Predictions"],
y_pred.reshape(1,len(y_pred)),
Expand All @@ -74,7 +65,7 @@ def knn(self):
)

print_confusion_matrix(
self.fused_data.y,
self.data.y,
y_pred,
"Confusion Matrix based on the whole data set",
self.settings.output
Expand All @@ -87,15 +78,4 @@ def knn(self):
weights=self.settings.weights,
algorithm=self.settings.algorithm
)
run_split_test(self.fused_data.x_data, self.fused_data.y, knn_split)


def predict(self, x_data: pd.DataFrame):
'''Performs kNN prediction once the model is trained.'''
if x_data is None:
raise TypeError("X data for kNN prediction must be non-empty.")
if self.model is None:
raise RuntimeError("The kNN model is not trained yet!")

y_pred = self.model.predict(x_data)
return y_pred
run_split_test(self.data.x_data, self.data.y, knn_split)
61 changes: 25 additions & 36 deletions chemfusekit/lda.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,39 +6,38 @@

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LD

from chemfusekit.lldf import LLDFModel
from chemfusekit.lldf import LLDFDataModel
from chemfusekit.__utils import graph_output, run_split_test
from chemfusekit.__utils import print_confusion_matrix, print_table, GraphMode
from .__base import BaseDataModel, BaseClassifier, BaseSettings
from .pca import PCADataModel

class LDASettings:

class LDASettings(BaseSettings):
'''Holds the settings for the LDA object.'''
def __init__(self, components: int = 3, output: GraphMode = GraphMode.NONE,
test_split: bool = False):
def __init__(self, components: int = 3, output: GraphMode = GraphMode.NONE, test_split: bool = False):
super().__init__(output, test_split)
if components <= 2:
raise ValueError("Invalid component number: must be a > 1 integer.")
if test_split is True and output is GraphMode.NONE:
raise Warning(
"You selected test_split but it won't run because you disabled the output."
)
self.components = components
self.output = output
self.test_split = test_split

class LDA:

class LDA(BaseClassifier):
'''Class to store the data, methods and artifacts for Linear Discriminant Analysis'''
def __init__(self, lldf_model: LLDFModel, settings: LDASettings):
def __init__(self, settings: LDASettings, data: BaseDataModel):
super().__init__(settings, data)
self.settings = settings
self.x_data = lldf_model.x_data
self.x_train = lldf_model.x_train
self.y = lldf_model.y
self.model: Optional[LD] = None
self.data = data
# Self-detect components if the data is from PCA
if isinstance(data, PCADataModel):
self.settings.components = data.components - 1

def lda(self):
'''Performs Linear Discriminant Analysis'''

lda = LD(n_components=self.settings.components) # N-1 where N are the classes
scores_lda = lda.fit(self.x_data, self.y).transform(self.x_data)
pred = lda.predict(self.x_data)
scores_lda = lda.fit(self.data.x_data, self.data.y).transform(self.data.x_data)
pred = lda.predict(self.data.x_data)

print_table(
[f"LV{i+1}" for i in range(scores_lda.shape[1])],
Expand Down Expand Up @@ -75,22 +74,22 @@ def lda(self):
self.settings.output
)

pred = lda.predict(self.x_data)
pred = lda.predict(self.data.x_data)
print_confusion_matrix(
y1=self.y,
y1=self.data.y,
y2=pred,
title="LDA Training Confusion Matrix",
mode=self.settings.output
)

lv_cols = [f'LV{i+1}' for i in range(self.settings.components)]
scores = pd.DataFrame(data = scores_lda, columns = lv_cols) # latent variables
scores.index = self.x_data.index
y_dataframe = pd.DataFrame(self.y, columns=['Substance'])
scores = pd.DataFrame(data=scores_lda, columns=lv_cols) # latent variables
scores.index = self.data.x_data.index
y_dataframe = pd.DataFrame(self.data.y, columns=['Substance'])

scores = pd.concat([scores, y_dataframe], axis = 1)

# Store the traiend model
# Store the trained model
self.model = lda

# Show graphs if required by the user
Expand All @@ -104,18 +103,8 @@ def lda(self):
# Run split tests if required by the user
if self.settings.test_split:
run_split_test(
(scores.drop('Substance', axis=1).values),
self.y,
scores.drop('Substance', axis=1).values,
self.data.y,
LD(n_components=self.settings.components),
mode=self.settings.output
)

def predict(self, x_data: pd.DataFrame):
'''Performs LDA prediction once the model is trained.'''
if x_data is None:
raise TypeError("X data for LDA prediction must be non-empty.")
if self.model is None:
raise RuntimeError("The LDA model is not trained yet!")

y_pred = self.model.predict(x_data)
return y_pred
18 changes: 8 additions & 10 deletions chemfusekit/lldf.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import matplotlib
import matplotlib.pyplot as plt

from .__utils import GraphMode
from .__base import GraphMode, BaseDataModel


class Table:
Expand All @@ -19,12 +19,10 @@ def __init__(self, file_path: str, sheet_name: str, preprocessing: str):
self.preprocessing = preprocessing


class LLDFModel:
class LLDFDataModel(BaseDataModel):
'''Models the output data from the LLDF operation'''
def __init__(self, x_data: pd.DataFrame, x_train: pd.DataFrame, y: np.ndarray):
self.x_data = x_data
self.x_train = x_train
self.y = y
super().__init__(x_data, x_train, y)


class LLDFSettings:
Expand All @@ -40,19 +38,19 @@ def _snv(input_data: np.ndarray):
for i in range(input_data.shape[0]):

# Apply correction
output_data[i,:] = (
(input_data[i,:] - np.mean(input_data[i,:])) / np.std(input_data[i,:])
output_data[i, :] = (
(input_data[i, :] - np.mean(input_data[i,:])) / np.std(input_data[i, :])
)

return output_data


class LLDF:
'''Holds together all the data, methods and artifacts of the LLDF operation'''
def __init__(self, tables: List[Table], settings: LLDFSettings):
def __init__(self, settings: LLDFSettings, tables: List[Table]):
self.settings = settings
self.tables = tables
self.fused_data: Optional[LLDFModel] = None
self.fused_data: Optional[LLDFDataModel] = None

def lldf(self):
'''Performs low-level data fusion'''
Expand Down Expand Up @@ -139,7 +137,7 @@ def lldf(self):
axis=1
)

self.fused_data = LLDFModel(x_data, x_train, y)
self.fused_data = LLDFDataModel(x_data, x_train, y)

def export_data(self, export_path: str):
'''Exports the data fusion artifacts to a file'''
Expand Down
Loading

0 comments on commit 6896b35

Please sign in to comment.