Skip to content

Commit

Permalink
feat!: added base class for classifiers, models, settings
Browse files Browse the repository at this point in the history
  • Loading branch information
f-aguzzi committed Jun 4, 2024
1 parent fd9a243 commit 4af5d47
Show file tree
Hide file tree
Showing 74 changed files with 1,514 additions and 161 deletions.
43 changes: 43 additions & 0 deletions chemfusekit/__base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
'''Base classes shared by all classifiers: data model, settings and classifier.'''

import pandas as pd
import numpy as np
import joblib
from .__utils import GraphMode


class BaseDataModel:
    '''Container for the data produced by a data-outputting operation.

    The three constructor arguments are stored unchanged as the attributes
    ``x_data``, ``x_train`` and ``y``.
    '''

    def __init__(self, x_data: pd.DataFrame, x_train: pd.DataFrame, y: np.ndarray):
        # Kept by reference: no copying or validation is performed here.
        self.x_data, self.x_train, self.y = x_data, x_train, y


class BaseSettings:
    '''Settings shared by every classifier: output mode and split-test flag.

    Raises:
        Warning: if ``test_split`` is ``True`` while ``output`` is
            ``GraphMode.NONE``.
    '''

    def __init__(self, output: GraphMode = GraphMode.NONE, test_split: bool = False):
        split_requested = test_split is True
        output_disabled = output is GraphMode.NONE

        # A split test with output disabled is rejected up front.
        if split_requested and output_disabled:
            raise Warning(
                "You selected test_split but it won't run because you disabled the output."
            )

        self.test_split = test_split
        self.output = output


class BaseClassifier:
    '''Parent class for all classifiers, containing basic shared utilities.

    Stores the settings and the input data, and keeps the model in
    ``self.model`` (``None`` until one is assigned or imported).
    '''

    def __init__(self, settings: BaseSettings, data: BaseDataModel):
        self.settings = settings
        self.data = data
        # No model yet: set by import_model() or by code that assigns self.model.
        self.model = None

    def import_model(self, import_path: str):
        '''Loads a model from ``import_path`` and stores it in ``self.model``.

        NOTE(review): joblib.load unpickles the file, which can execute
        arbitrary code. Only import model files from trusted sources.
        '''
        # joblib.load takes the file path and returns the deserialized object;
        # the result must be assigned, otherwise the loaded model is lost.
        self.model = joblib.load(import_path)

    def export_model(self, export_path: str):
        '''Dumps ``self.model`` to ``export_path``.

        Raises:
            RuntimeError: if ``self.model`` is still ``None``.
        '''
        if self.model is None:
            raise RuntimeError("You haven't trained the model yet! You cannot export it now.")
        joblib.dump(self.model, export_path)

44 changes: 17 additions & 27 deletions chemfusekit/knn.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,18 @@

import pandas as pd

from chemfusekit.lldf import LLDFModel
from chemfusekit.lldf import LLDFDataModel
from chemfusekit.__utils import run_split_test, print_confusion_matrix, print_table, GraphMode
from .__base import BaseSettings, BaseClassifier

class KNNSettings:

class KNNSettings(BaseSettings):
'''Holds the settings for the kNN object.'''
def __init__(
self,
n_neighbors: int = 15,
metric: str | Callable = 'euclidean',
weights: str | Callable = 'uniform',
algorithm: str = 'auto',
output: GraphMode = GraphMode.NONE,
test_split: bool = False
):
def __init__(self, n_neighbors: int = 15, metric: str | Callable = 'euclidean', weights: str | Callable = 'uniform',
algorithm: str = 'auto', output: GraphMode = GraphMode.NONE, test_split: bool = False):

super().__init__(output, test_split)

if n_neighbors < 1:
raise ValueError("Invalid n_neighbors number: should be a positive integer.")
if metric not in ['minkwoski', 'precomputed', 'euclidean'] and not callable(metric):
Expand All @@ -32,23 +30,16 @@ def __init__(
raise ValueError(
"Invalid algorithm: should be 'auto', 'ball_tree', 'kd_tree' or 'brute'."
)
if test_split is True and output is GraphMode.NONE:
raise Warning(
"You selected test_split but it won't run because you disabled the output."
)
self.n_neighbors = n_neighbors
self.metric = metric
self.weights = weights
self.algorithm = algorithm
self.output = output
self.test_split = test_split

class KNN:

class KNN(BaseClassifier):
'''Class to store the data, methods and artifacts for k-Nearest Neighbors Analysis'''
def __init__(self, settings: KNNSettings, fused_data: LLDFModel):
self.settings = settings
self.fused_data = fused_data
self.model: Optional[KNeighborsClassifier] = None
def __init__(self, settings: KNNSettings, fused_data: LLDFDataModel):
super().__init__(settings, fused_data)

def knn(self):
'''Performs k-Nearest Neighbors Analysis'''
Expand All @@ -59,13 +50,13 @@ def knn(self):
weights=self.settings.weights,
algorithm=self.settings.algorithm
)
knn.fit(self.fused_data.x_data, self.fused_data.y)
knn.fit(self.data.x_data, self.data.y)

# Save the trained model
self.model = knn

# View the prediction on the test data
y_pred = knn.predict(self.fused_data.x_data)
y_pred = knn.predict(self.data.x_data)
print_table(
["Predictions"],
y_pred.reshape(1,len(y_pred)),
Expand All @@ -74,7 +65,7 @@ def knn(self):
)

print_confusion_matrix(
self.fused_data.y,
self.data.y,
y_pred,
"Confusion Matrix based on the whole data set",
self.settings.output
Expand All @@ -87,8 +78,7 @@ def knn(self):
weights=self.settings.weights,
algorithm=self.settings.algorithm
)
run_split_test(self.fused_data.x_data, self.fused_data.y, knn_split)

run_split_test(self.data.x_data, self.data.y, knn_split)

def predict(self, x_data: pd.DataFrame):
'''Performs kNN prediction once the model is trained.'''
Expand Down
29 changes: 13 additions & 16 deletions chemfusekit/lda.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,32 +6,29 @@

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LD

from chemfusekit.lldf import LLDFModel
from chemfusekit.lldf import LLDFDataModel
from chemfusekit.__utils import graph_output, run_split_test
from chemfusekit.__utils import print_confusion_matrix, print_table, GraphMode
from .__base import BaseDataModel, BaseClassifier, BaseSettings

class LDASettings:

class LDASettings(BaseSettings):
'''Holds the settings for the LDA object.'''
def __init__(self, components: int = 3, output: GraphMode = GraphMode.NONE,
test_split: bool = False):
def __init__(self, components: int = 3, output: GraphMode = GraphMode.NONE, test_split: bool = False):
super().__init__(output, test_split)
if components <= 2:
raise ValueError("Invalid component number: must be a > 1 integer.")
if test_split is True and output is GraphMode.NONE:
raise Warning(
"You selected test_split but it won't run because you disabled the output."
)
self.components = components
self.output = output
self.test_split = test_split

class LDA:

class LDA(BaseClassifier):
'''Class to store the data, methods and artifacts for Linear Discriminant Analysis'''
def __init__(self, lldf_model: LLDFModel, settings: LDASettings):
def __init__(self, settings: LDASettings, data_model: BaseDataModel):
super().__init__(settings, data_model)
self.settings = settings
self.x_data = lldf_model.x_data
self.x_train = lldf_model.x_train
self.y = lldf_model.y
self.model: Optional[LD] = None
self.x_data = data_model.x_data
self.x_train = data_model.x_train
self.y = data_model.y

def lda(self):
'''Performs Linear Discriminant Analysis'''
Expand Down
16 changes: 7 additions & 9 deletions chemfusekit/lldf.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import matplotlib
import matplotlib.pyplot as plt

from .__utils import GraphMode
from .__base import GraphMode, BaseDataModel


class Table:
Expand All @@ -19,12 +19,10 @@ def __init__(self, file_path: str, sheet_name: str, preprocessing: str):
self.preprocessing = preprocessing


class LLDFModel:
class LLDFDataModel(BaseDataModel):
'''Models the output data from the LLDF operation'''
def __init__(self, x_data: pd.DataFrame, x_train: pd.DataFrame, y: np.ndarray):
self.x_data = x_data
self.x_train = x_train
self.y = y
super().__init__(x_data, x_train, y)


class LLDFSettings:
Expand All @@ -40,8 +38,8 @@ def _snv(input_data: np.ndarray):
for i in range(input_data.shape[0]):

# Apply correction
output_data[i,:] = (
(input_data[i,:] - np.mean(input_data[i,:])) / np.std(input_data[i,:])
output_data[i, :] = (
(input_data[i, :] - np.mean(input_data[i,:])) / np.std(input_data[i, :])
)

return output_data
Expand All @@ -52,7 +50,7 @@ class LLDF:
def __init__(self, tables: List[Table], settings: LLDFSettings):
self.settings = settings
self.tables = tables
self.fused_data: Optional[LLDFModel] = None
self.fused_data: Optional[LLDFDataModel] = None

def lldf(self):
'''Performs low-level data fusion'''
Expand Down Expand Up @@ -139,7 +137,7 @@ def lldf(self):
axis=1
)

self.fused_data = LLDFModel(x_data, x_train, y)
self.fused_data = LLDFDataModel(x_data, x_train, y)

def export_data(self, export_path: str):
'''Exports the data fusion artifacts to a file'''
Expand Down
13 changes: 4 additions & 9 deletions chemfusekit/lr.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@
from sklearn.linear_model import LogisticRegression

from chemfusekit.__utils import run_split_test, print_confusion_matrix, print_table, GraphMode
from .__base import BaseSettings

class LRSettings:
class LRSettings(BaseSettings):
'''Holds the settings for the LR object.'''
def __init__(self, algorithm: str = 'liblinear', output: GraphMode = GraphMode.NONE,
test_split: bool = False):
def __init__(self, algorithm: str = 'liblinear', output: GraphMode = GraphMode.NONE, test_split: bool = False):
super().__init__(output, test_split)
if algorithm not in [
'lbfgs',
'liblinear',
Expand All @@ -21,13 +22,7 @@ def __init__(self, algorithm: str = 'liblinear', output: GraphMode = GraphMode.N
'saga'
]:
raise ValueError(f"{algorithm}: this algorithm does not exist.")
if test_split is True and output is GraphMode.NONE:
raise Warning(
"You selected test_split but it won't run because you disabled the output."
)
self.algorithm = algorithm
self.output = output
self.test_split = test_split


class LR:
Expand Down
18 changes: 9 additions & 9 deletions chemfusekit/pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@

import scipy.stats

from chemfusekit.lldf import LLDFModel
from chemfusekit.__utils import print_table, GraphMode
from .__base import BaseDataModel


class PCASettings:
'''Holds the settings for the PCA object.'''
Expand All @@ -32,17 +33,18 @@ def __init__(self, target_variance: float = 0.95,

class PCA:
'''A class to store the data, methods and artifacts for Principal Component Analysis'''
def __init__(self, fused_data: LLDFModel, settings: PCASettings):
self.fused_data = fused_data
def __init__(self, settings: PCASettings, data: BaseDataModel):
self.data = data
self.components = 0
self.pca_model: Optional[PC] = None
self.settings = settings
self.array_scores: Optional[np.ndarray] = None

def pca(self):
'''Performs Principal Component Analysis.'''
# Read from the data fusion object
x_data = self.fused_data.x_data

# Read from the data fusion object
x_data = self.data.x_data

# Run PCA producing the reduced variable Xreg and select the first 10 components
pca = PC(self.settings.initial_components)
Expand Down Expand Up @@ -100,11 +102,10 @@ def pca(self):
self.pca_model = pca
self.pca_model.fit_transform(x_data)


def pca_stats(self):
'''Produces PCA-related statistics.'''
x_data = self.fused_data.x_data
x_train = self.fused_data.x_train
x_data = self.data.x_data
x_train = self.data.x_train

# Prepare the Scores dataframe (and concatenate the original 'Region' variable)
pc_cols = [f"PC{i+1}" for i in range(self.components)]
Expand Down Expand Up @@ -246,7 +247,6 @@ def mean_confidence_interval(data, confidence=self.settings.confidence_level):
)
fig_normalized.show()


# Assuming 'scores' is your DataFrame with the 'class' column
# Drop the 'class' column before converting to NumPy array
array_scores = scores.drop('Substance', axis=1).values
Expand Down
Loading

0 comments on commit 4af5d47

Please sign in to comment.