Skip to content

Commit

Permalink
refactor: extract split tests and graphs (merge #23)
Browse files Browse the repository at this point in the history
- refactor: separate split tests
- refactor: extract utils from `knn`, `lda`, `lr`
- refactor: extract graphs and splits in svm and plsda
  • Loading branch information
f-aguzzi authored May 20, 2024
1 parent 0b13ed3 commit 6865c88
Show file tree
Hide file tree
Showing 15 changed files with 259 additions and 394 deletions.
2 changes: 1 addition & 1 deletion chemfusekit/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
'''__init__.py file for the library'''
from beartype import BeartypeConf
from beartype.claw import beartype_this_package
beartype_this_package(conf=BeartypeConf(violation_type=TypeError))
beartype_this_package(conf=BeartypeConf(violation_type=TypeError))
115 changes: 115 additions & 0 deletions chemfusekit/__utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
'''Utilities model: functions that are shared between different classes'''
from sklearn.cross_decomposition import PLSRegression as PLSR
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import numpy as np

import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

def graph_output(scores, model, name: str):
    '''Print the scores and explained variance of a model, then plot the scores.

    Args:
        scores: table of scores; the plots read its `LV1`, `LV2`, `LV3` and
            `Substance` columns and use its index as the 3D hover name.
        model: fitted model exposing `explained_variance_ratio_`.
        name: human-readable model name used in the printed text and in the
            title of the 3D plot.
    '''
    print(scores)

    variance_ratio = model.explained_variance_ratio_
    print(f"""
explained variance ratio with {name}:
{variance_ratio}
""")

    # The ratio is reported a second time, in single-line form
    print("Explained Variance Ratio:", variance_ratio)

    # 2D scores plot on the first two latent variables, with visible zero lines
    zero_line_style = {'zeroline': True, 'zerolinewidth': 1, 'zerolinecolor': 'Black'}
    plane_plot = px.scatter(
        scores,
        x="LV1",
        y="LV2",
        color="Substance",
        hover_data=['Substance']
    )
    plane_plot.update_xaxes(**zero_line_style)
    plane_plot.update_yaxes(**zero_line_style)
    plane_plot.update_layout(height=600, width=800, title_text='Scores Plot')
    plane_plot.show()

    # 3D scores plot on the first three latent variables
    space_plot = px.scatter_3d(
        scores,
        x='LV1',
        y='LV2',
        z='LV3',
        color='Substance',
        hover_data=['Substance'],
        hover_name=scores.index
    )
    space_plot.update_layout(title_text=f"3D colored by Substance for {name}")
    space_plot.show()

def run_split_test(x, y, model, extended=False):
    '''Fit a model on a train/test split and show how it performs.

    The data is split 70/30 (shuffled, stratified on `y`, fixed random seed),
    `model` is fitted in place on the training part, and a confusion matrix
    plus classification report is produced for the evaluation part.

    Args:
        x: feature data, passed to `train_test_split`.
        y: target labels; also used for stratification.
        model: unfitted estimator exposing `fit` and `predict`. With
            `extended=True` it must also expose `classes_`, `intercept_`,
            `coef_`, `predict_proba` and `score`.
        extended: when True, also print the fitted model parameters, the
            calibration (training-set) probabilities, score and predictions,
            and a confusion matrix based on the training set.
    '''
    x_train, x_test, y_train, y_test = train_test_split(
        x,
        y,
        test_size=0.3,
        random_state=42,
        shuffle=True,
        stratify=y
    )

    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # TODO: add something to print _x_scores multimodally

    if extended:
        # We can see the classes the model used
        print(model.classes_)
        # See the intercept of the model
        print(model.intercept_)
        # See the coefficients of the model - that can be easily interpreted
        # (correlating or not with y)
        print(model.coef_)

        # Evaluate the model: each sample has a probability of belonging to
        # Positive or Negative outcome. Class 0 is Negative, class 1 is
        # Positive. If the value of the first column (probability of being
        # Negative) is higher than 0.5, we have a Negative sample. Otherwise,
        # it will be Positive.
        probabilities = model.predict_proba(x_train)
        print(probabilities)

        # This tells us the score of our model in calibration (accuracy for
        # classifiers); it used to be computed and silently discarded.
        print("Calibration score:", model.score(x_train, y_train))

        predictions = model.predict(x_train)

        print("Calibration predictions: ")
        print(predictions)
        print_confusion_matrix(y_train, predictions, "Confusion matrix based on training set")

    if isinstance(model, PLSR):
        # PLS regression predicts continuous values: round them to the
        # nearest non-negative integer so they can be compared to the labels
        y_pred = np.int8(np.abs(np.around(y_pred, decimals=0)))

    print_confusion_matrix(y_test, y_pred, "Confusion matrix based on evaluation set")

# TODO: make multimodal
def print_confusion_matrix(y1, y2, title):
    '''Plot a confusion matrix and print the matching classification report.

    Args:
        y1: true labels.
        y2: predicted labels.
        title: title shown above the heatmap.
    '''
    # Use every label that appears in either the true or the predicted values.
    # Taking them from the predictions alone gives a tick-label list that does
    # not match the matrix whenever a class is never predicted. Flattening
    # first also copes with predictions shaped (n, 1).
    class_labels = sorted(set(np.ravel(y1)) | set(np.ravel(y2)))

    # Passing the labels explicitly ties the matrix rows/columns to the ticks
    cm = confusion_matrix(y1, y2, labels=class_labels)

    # Plot the confusion matrix using seaborn with custom colormap (Blues)
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
        cbar=False,
        vmin=0,
        vmax=cm.max()
    )

    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(title)
    plt.show()

    # Print the classification report
    print(classification_report(y1, y2, digits=2))
81 changes: 7 additions & 74 deletions chemfusekit/knn.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,11 @@
from beartype.typing import Callable

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from chemfusekit.lldf import LLDFModel
from chemfusekit.__utils import run_split_test, print_confusion_matrix

class KNNSettings:
'''Holds the settings for the kNN object.'''
Expand Down Expand Up @@ -50,10 +46,6 @@ def __init__(
class KNN:
'''Class to store the data, methods and artifacts for k-Nearest Neighbors Analysis'''
def __init__(self, settings: KNNSettings, fused_data: LLDFModel):
if type(settings) is not KNNSettings:
raise TypeError("Invalid settings: should be a KNNSettings-class object.")
if type(fused_data) is not LLDFModel:
raise TypeError("Invalid fused_data: shold be a LLDFModel-class object.")
self.settings = settings
self.fused_data = fused_data
self.model: Optional[KNeighborsClassifier] = None
Expand All @@ -77,80 +69,21 @@ def knn(self):
y_pred = knn.predict(self.fused_data.x_data)
print(y_pred)

# Assuming 'y_true' and 'y_pred' are your true and predicted labels
cm = confusion_matrix(self.fused_data.y, y_pred)

# Get unique class labels from y_true
class_labels = sorted(set(self.fused_data.y))

# Plot the confusion matrix using seaborn with custom colormap (Blues)
sns.heatmap(
cm,
annot=True,
fmt='d',
cmap='Blues',
xticklabels=class_labels,
yticklabels=class_labels,
cbar=False, vmin=0,
vmax=cm.max()
)

plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix based on the whole data set')
plt.show()

# Print the classification report
print(classification_report(self.fused_data.y, y_pred, digits=2))

if self.settings.test_split and self.settings.output:
# Split the data into a training set and a test set
x_train, x_test, y_train, y_test = train_test_split(
self.fused_data.x_data,
print_confusion_matrix(
self.fused_data.y,
test_size=0.3,
random_state=42
y_pred,
"Confusion Matrix based on the whole data set"
)

# Train the kNN model on the training section of the dataset
knn = KNeighborsClassifier(
if self.settings.test_split and self.settings.output:
knn_split = KNeighborsClassifier(
n_neighbors=self.settings.n_neighbors,
metric=self.settings.metric,
weights=self.settings.weights,
algorithm=self.settings.algorithm
)
knn.fit(x_train, y_train)

# View the prediction on the test data
y_pred = knn.predict(x_test)
print(y_pred)

# Assuming 'y_true' and 'y_pred' are your true and predicted labels
cm = confusion_matrix(y_test, y_pred)

# Get unique class labels from y_true
class_labels = sorted(set(y_test))

# Plot the confusion matrix using seaborn with custom colormap (Blues)
sns.heatmap(
cm,
annot=True,
fmt='d',
cmap='Blues',
xticklabels=class_labels,
yticklabels=class_labels,
cbar=False,
vmin=0,
vmax=cm.max()
)

plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix based on evaluation set')
plt.show()
run_split_test(self.fused_data.x_data, self.fused_data.y, knn_split)

# Print the classification report
print(classification_report(y_test, y_pred, digits=2))

def predict(self, x_data: pd.DataFrame):
'''Performs kNN prediction once the model is trained.'''
Expand Down
98 changes: 19 additions & 79 deletions chemfusekit/lda.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,23 @@

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LD
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, classification_report

import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

from chemfusekit.lldf import LLDFModel
from chemfusekit.__utils import graph_output, run_split_test

class LDASettings:
'''Holds the settings for the LDA object.'''
def __init__(self, components: int = 3, output: bool = False):
def __init__(self, components: int = 3, output: bool = False, test_split: bool = False):
if components <= 2:
raise ValueError("Invalid component number: must be a > 1 integer.")
if test_split is True and output is False:
raise Warning(
"You selected test_split but it won't run because you disabled the output."
)
self.components = components
self.output = output
self.test_split = test_split

class LDA:
'''Class to store the data, methods and artifacts for Linear Discriminant Analysis'''
Expand Down Expand Up @@ -56,81 +57,20 @@ def lda(self):

scores = pd.concat([scores, y_dataframe], axis = 1)

if self.settings.output:
print(scores)

print(f"""
explained variance ratio (three components) with LDA:
{lda.explained_variance_ratio_}
""")

# Display the explained variance ratio
print("Explained Variance Ratio:", lda.explained_variance_ratio_)

#Scores plot
fig = px.scatter(scores, x="LV1", y="LV2", color="Substance", hover_data=['Substance'])
fig.update_xaxes(zeroline=True, zerolinewidth=1, zerolinecolor='Black')
fig.update_yaxes(zeroline=True, zerolinewidth=1, zerolinecolor='Black')
fig.update_layout(
height=600,
width=800,
title_text='Scores Plot')
fig.show()

# Plot 3D scores
fig = px.scatter_3d(scores, x='LV1', y='LV2', z='LV3',
color='Substance', hover_data=['Substance'],
hover_name=scores.index
)
fig.update_layout(
title_text='3D colored by Substance for Linear Discriminant Analysis')
fig.show()

lda2 = LD(n_components=self.settings.components)

self.x_train, x_test, y_train, y_test = train_test_split(
(scores.drop('Substance', axis=1).values),
self.y,
test_size=0.3,
random_state=42
)

lda2.fit(self.x_train, y_train)
lda2.predict(x_test)
y_pred = lda2.predict(x_test)

if self.settings.output:
self.__print_prediction_graphs(y_test, y_pred)

# Store the trained model
self.model = lda

def __print_prediction_graphs(self, y_test, y_pred):
'''Helper function to print graphs and stats about LDA predictions.'''
# Assuming 'y_test' and 'y_pred' are your true and predicted labels
cm = confusion_matrix(y_test, y_pred)

# Get unique class labels from y_true
class_labels = sorted(set(y_test))

# Plot the confusion matrix using seaborn with custom colormap (Blues)
sns.heatmap(cm,
annot=True,
fmt='d',
cmap='Blues',
xticklabels=class_labels,
yticklabels=class_labels,
cbar=False,
vmin=0,
vmax=cm.max()
)

plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix based on evaluation set')
plt.show()

# Print the classification report
print(classification_report(y_test, y_pred, digits=2))
# Show graphs if required by the user
if self.settings.output:
graph_output(scores, self.model, "Linear Discriminant Analysis")

# Run split tests if required by the user
if self.settings.test_split:
run_split_test(
(scores.drop('Substance', axis=1).values),
self.y,
LD(n_components=self.settings.components)
)

def predict(self, x_data: pd.DataFrame):
'''Performs LDA prediction once the model is trained.'''
Expand Down
Loading

0 comments on commit 6865c88

Please sign in to comment.