Skip to content

Commit

Permalink
Merge pull request #47 from f-aguzzi/pre/beta
Browse files Browse the repository at this point in the history
Pre/beta
  • Loading branch information
f-aguzzi authored Jun 29, 2024
2 parents 5573f1a + fa0c5b4 commit f5cf755
Show file tree
Hide file tree
Showing 13 changed files with 215 additions and 34 deletions.
26 changes: 26 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,29 @@
## [3.1.0-beta.3](https://github.com/f-aguzzi/tesi/compare/v3.1.0-beta.2...v3.1.0-beta.3) (2024-06-29)


### Features

* **DF:** add direct file blob import ([6ea358c](https://github.com/f-aguzzi/tesi/commit/6ea358c99b612fdd2bc79fb7de62cdb85bbf8d1c))

## [3.1.0-beta.2](https://github.com/f-aguzzi/tesi/compare/v3.1.0-beta.1...v3.1.0-beta.2) (2024-06-29)


### Features

* add support for BytesIO inputs ([bab82e8](https://github.com/f-aguzzi/tesi/commit/bab82e8cfc208062a66dee871f28ca32835c951d))

## [3.1.0-beta.1](https://github.com/f-aguzzi/tesi/compare/v3.0.0...v3.1.0-beta.1) (2024-06-29)


### Features

* **DF:** add file-like input capabilities ([e5c707b](https://github.com/f-aguzzi/tesi/commit/e5c707bd5897f363a831cb21ae7d81fb0699c3c7))


### Docs

* fix broken links in cookbook and documentation ([2353ba9](https://github.com/f-aguzzi/tesi/commit/2353ba928c155a9abf2ff7ba416440e9fe1668f9))

## [3.0.0](https://github.com/f-aguzzi/tesi/compare/v2.5.1...v3.0.0) (2024-06-27)


Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "chemfusekit"
version = "3.0.0"
version = "3.1.0b3"
description = "A minimal Python / Jupyter Notebook / Colab library for data fusion and chemometrical analysis."
authors = [
{ name = "Federico Aguzzi", email = "62149513+f-aguzzi@users.noreply.github.com" }
Expand Down
71 changes: 38 additions & 33 deletions src/chemfusekit/df.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Performs low-level data fusion on input arrays, outputs the results"""
from typing import Optional, List
from typing import Optional, List, IO
from io import BytesIO

import numpy as np
import pandas as pd
Expand All @@ -16,7 +17,7 @@
class Table:
"""Holds the path, preprocessing choice and sheet name for a single Excel table."""

def __init__(self, file_path: str, sheet_name: str, preprocessing: str, feature_selection: str | None = None,
def __init__(self, file_path: str | IO | BytesIO, sheet_name: str, preprocessing: str, feature_selection: str | None = None,
class_column: str = 'Substance', index_column: str | None = None):
self.file_path = file_path
self.sheet_name = sheet_name
Expand Down Expand Up @@ -133,21 +134,27 @@ def fuse(self):
ax1.set_title(f'Original data')
ax2.plot(wl, preprocessed_x.T)
ax2.set_title(f'Processed table with {table.preprocessing}')
if table.file_path.endswith('.xlsx'):
fig.suptitle(f'Imported table: {table.sheet_name} from {table.file_path}')
if isinstance(table.file_path, str):
if table.file_path.endswith('.xlsx'):
fig.suptitle(f'Imported table: {table.sheet_name} from {table.file_path}')
else:
fig.suptitle(f'Imported table: {table.file_path}')
else:
fig.suptitle(f'Imported table: {table.file_path}')
fig.suptitle(f'Imported table: ')
else:
# Let's plot the different datasets we preprocessed
fig, ax1 = plt.subplots(1, figsize=(15, 15))
if x.shape[1] == 1:
ax1.plot(x)
else:
ax1.plot(wl, x.T)
if table.file_path.endswith('.xlsx'):
fig.suptitle(f'Imported table: {table.sheet_name} from {table.file_path} (no preprocessing)')
if isinstance(table.file_path, str):
if table.file_path.endswith('.xlsx'):
fig.suptitle(f'Imported table: {table.sheet_name} from {table.file_path} (no preprocessing)')
else:
fig.suptitle(f'Imported table: {table.file_path} (no preprocessing)')
else:
fig.suptitle(f'Imported table: {table.file_path} (no preprocessing)')
fig.suptitle(f'Imported table (no preprocessing):')

# Create a new DataFrame with the processed numerical attributes
processed_dataframe_x = pd.DataFrame(
Expand Down Expand Up @@ -230,31 +237,29 @@ def _perform_feature_selection(table: Table, data_model: BaseDataModel) -> BaseD

@staticmethod
def _import_table(file_path, sheet_name) -> pd.DataFrame:
    """Import a table from a path string or from an open file-like object.

    Args:
        file_path: Either a path ending in ``.xlsx``, ``.csv`` or ``.json``,
            or a file-like object (``BytesIO``, a handle returned by
            ``open(..., 'rb')``, ...). File-like objects are always parsed
            as Excel workbooks, because no extension is inspected for them.
        sheet_name: Worksheet to read. Only used for Excel input.

    Returns:
        The imported table. Excel and CSV inputs use the first column as
        index and the first row as header; JSON is read with
        ``orient='table'``.

    Raises:
        ValueError: The file-like object could not be read as Excel.
        FileNotFoundError: The path could not be opened or parsed. An
            unsupported extension is also reported this way, because the
            inner ``ValueError`` is raised inside the wrapped ``try``.
        TypeError: ``file_path`` is neither a string nor a file-like object.
    """
    # Function-scope import so this fix does not depend on the module header.
    from io import IOBase

    # typing.IO is an annotation helper: handles returned by open() do not
    # inherit from it, so testing against it alone rejects real files.
    # io.IOBase is the runtime base class of BytesIO and of built-in file
    # objects. IO stays in the tuple for classes that subclass it explicitly.
    if isinstance(file_path, (IO, IOBase)):
        try:
            return pd.read_excel(file_path, sheet_name=sheet_name, index_col=0, header=0)
        except Exception as exc:
            raise ValueError("Error reading the file-like object.") from exc

    if not isinstance(file_path, str):
        raise TypeError("Unsupported file type. Expected str or IO.")

    try:
        # Autodetect the format based on the file extension
        if file_path.endswith('.xlsx'):
            table_data = pd.read_excel(file_path, sheet_name=sheet_name, index_col=0, header=0)
        elif file_path.endswith('.csv'):
            table_data = pd.read_csv(file_path, index_col=0, header=0)
        elif file_path.endswith('.json'):
            table_data = pd.read_json(file_path, orient='table')
        else:
            raise ValueError(f"Unsupported file format: {file_path}")
    except Exception as exc:
        # Kept as FileNotFoundError for every failure so existing callers
        # that catch it keep working.
        raise FileNotFoundError("Error opening the selected files.") from exc

    return table_data

Expand Down
111 changes: 111 additions & 0 deletions thesis/4.4-case-study-GUI.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,117 @@ Uno degli obiettivi originali per la costruzione di ChemFuseKit, poi eliminato,

È stato scelto di impiegare la libreria Streamlit per la costruzione dell’interfaccia grafica in questo esempio. Si tratta di una libreria *open source* per trasformare script per l’analisi di dati in web app facilmente condivisibili, programmate in puro Python, senza che sia necessaria esperienza nel web development. Il design grafico, la disponibilità di componenti specifici e il loro aspetto estetico sono già definiti dalla libreria. Il caricamento di file è semplice, e permette di passare tabelle all’applicazione con estrema facilità. I componenti di Streamlit permettono la visualizzazione grafica di strutture dati basate su DataFrame della libreria Pandas, come quelle contenute in ChemFuseKit. Il processo di design dell’interfaccia è dunque puramente incentrato sull’ambito funzionale. Al programmatore è soltanto richiesto di decidere cosa inserire nella pagina, e come utilizzare i componenti grafici per invocare le funzionalità dei propri script o delle proprie librerie. Questo permette la prototipazione di web app complete con estrema rapidità, e rende la creazione dell’interfaccia un pensiero secondario in progetti, come questo, in cui la parte fondamentale riguarda l’elaborazione dei dati.

Il codice complessivo dell'applicazione, brevissimo e di facile lettura, è il seguente:

\tiny
```python
"""
# Data fusion
A graphical user interface for the data fusion module in `ChemFuseKit`.
"""

from io import BytesIO

import pandas as pd
import streamlit as st
from chemfusekit.df import Table, DFSettings, DF

st.title("ChemFuseKit data fusion module")

st.markdown("""
Use this web application to leverage the data fusion abilities of ChemFuseKit.
**Instructions:**
1. upload your table files
2. insert the settings for each table in the forms, and submit them one by one
3. select the fusion technique
4. click "Fuse data"
5. download the resulting data
""")

tables = st.file_uploader(label="Upload your tables here", accept_multiple_files=True)

# Submitted Table settings are accumulated in session state so that every
# form submission adds to the same list.
if "tabled_tables" not in st.session_state:
    st.session_state.tabled_tables = []

# One settings form per uploaded file.
for table in tables:
    # The upload is copied into an in-memory buffer that is handed to Table.
    # NOTE(review): the buffer does not carry the file name, so the library
    # cannot tell the format from it - confirm non-Excel uploads are handled.
    file = BytesIO(table.read())
    with st.form(f"Form for table {table}"):
        st.markdown(f"Import settings for: {table.name}")
        # Only Excel workbooks have sheets; other formats get a placeholder.
        if table.name.endswith(".xlsx"):
            sheet_name = st.text_input("Sheet name: ")
        else:
            sheet_name = 'none'
        preprocessing = st.selectbox(
            "Preprocessing (SNV, Savitzky-Golay, both or none)",
            ("snv", "savgol", "savgol+snv", "none"))
        feature_selection = st.selectbox(
            "Feature selection (PCA, PLSDA or none)",
            ("pca", "plsda", "none"))
        class_column = st.text_input("Class column: ")
        # NOTE(review): this value is collected but not passed to Table below;
        # confirm whether index_column should be wired in.
        index_column = st.text_input("Index column: ")
        submitted = st.form_submit_button("Submit")

    if submitted:
        # Empty text fields fall back to the defaults spelled out here.
        st.session_state.tabled_tables.append(Table(
            file_path=file,
            sheet_name=sheet_name if sheet_name != '' else 'Sheet1',
            preprocessing=preprocessing,
            feature_selection=feature_selection if feature_selection != 'none' else None,
            class_column=class_column if class_column != '' else 'Substance',
            # index_column=index_column
        ))

if tables:
    st.markdown(f"Imported {len(st.session_state.tabled_tables)} tables.")

if st.session_state.tabled_tables:
    fusion_type = st.selectbox(
        "Fusion technique: ",
        ("concat", "outer"))

    if st.button("Fuse data"):
        df = DF(DFSettings(output='none', method=fusion_type), st.session_state.tabled_tables)
        df.fuse()
        # Show the fused table in the page explicitly instead of relying on a
        # bare expression statement.
        st.write(df.fused_data.x_train)

        buffer = BytesIO()
        # The context manager finalises the workbook and closes the writer on
        # exit, so no explicit writer.close() is needed (it would close twice).
        with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
            df.fused_data.x_train.to_excel(writer, sheet_name='Sheet1')

        st.download_button(
            label="Download fused data as Excel",
            # getvalue() returns the whole workbook regardless of the
            # buffer's current position.
            data=buffer.getvalue(),
            file_name="fused_data.xlsx",
            # MIME type of .xlsx workbooks (vnd.ms-excel is the legacy .xls one).
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )
```

\normalsize
L'applicazione si avvia presentando una semplice lista delle istruzioni, e un bottone per caricare le tabelle.

![Schermata iniziale e caricamento delle tabelle](cs4-1.png)

Dopodiché viene reso possibile selezionare tramite una form le impostazioni per le singole tabelle da importare.

![Impostazioni per le singole tabelle](cs4-2.png)

Successivamente si possono scegliere le opzioni per la fusione, e avviarla.

![Impostazioni e avvio della fusione](cs4-3.png)

I risultati della fusione sono visualizzabili direttamente nell'applicazione.

![Risultati della fusione](cs4-4.png)

A fusione eseguita, è possibile scaricare il file XLSX contenente i dati fusi. Il contenuto del file mantiene la struttura visualizzata nell'anteprima all'interno dell'applicazione.

![Risultati della fusione, scaricati come file](cs4-5.png)

### Risultati

### Discussione
Expand Down
2 changes: 2 additions & 0 deletions thesis/998-ringraziamenti.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
\newpage

# Ringraziamenti

Ringraziamenti.
Expand Down
2 changes: 2 additions & 0 deletions thesis/999-bibliografia.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
\newpage

# Bibliografia

Bajoub, A., Medina-Rodríguez, S., Gómez-Romero, M., El Amine, A., Bagur-González, M. G., Fernández-Gutiérrez, A., Carrasco-Pancorbo, A. (2017). Assessing the varietal origin of extra-virgin olive oil using liquid chromatography fingerprints of phenolic compound, data fusion and chemometrics. *Food Chemistry*, 215, 245-255.
Expand Down
Binary file added thesis/cs4-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added thesis/cs4-2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added thesis/cs4-3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added thesis/cs4-4.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added thesis/cs4-5.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified thesis/use_cases.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
35 changes: 35 additions & 0 deletions thesis/use_cases.puml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
@startuml
rectangle "Libreria Python" {
(Imparare la chemiometria)
(Classificare i dati dalla scena del crimine)
(Fondere i dati)
(Importare ed esportare i dati)
(Creare grafici)
}

rectangle "Ambiente Colab" {
(Utilizzare la libreria da Colab)
}

rectangle "Applicazione grafica" {
(Utilizzare l'applicazione grafica)
}

(Aggiungere nuove funzionalità)

"Studente" --> (Imparare la chemiometria)
"Analista forense" --> (Classificare i dati dalla scena del crimine)
"Chiunque" --> (Fondere i dati)
"Chiunque" --> (Importare ed esportare i dati)
"Chiunque" --> (Creare grafici)
"Chiunque" -up-> (Utilizzare l'applicazione grafica)
"Utente di Colab" --> (Utilizzare la libreria da Colab)
"Gestore del sistema" -up-> (Aggiungere nuove funzionalità)
"Programmatore esterno" -down-> (Applicazione grafica) : <<implementa>>

"Ambiente Colab" ..|> "Libreria Python" : <<usa>>
"Applicazione grafica" ..|> "Libreria Python" : <<basato su>>

(Aggiungere nuove funzionalità) -up-|> "Ambiente Colab"
(Aggiungere nuove funzionalità) -up-|> "Libreria Python"
@enduml

0 comments on commit f5cf755

Please sign in to comment.