Merge pull request #47 from f-aguzzi/pre/beta

Pre/beta
f-aguzzi · Jun 29, 2024 · f5cf755 · f5cf755
2 parents 5573f1a + fa0c5b4
commit f5cf755
Show file tree

Hide file tree

Showing 13 changed files with 215 additions and 34 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,29 @@
+## [3.1.0-beta.3](https://github.com/f-aguzzi/tesi/compare/v3.1.0-beta.2...v3.1.0-beta.3) (2024-06-29)
+
+
+### Features
+
+* **DF:** add direct file blob import ([6ea358c](https://github.com/f-aguzzi/tesi/commit/6ea358c99b612fdd2bc79fb7de62cdb85bbf8d1c))
+
+## [3.1.0-beta.2](https://github.com/f-aguzzi/tesi/compare/v3.1.0-beta.1...v3.1.0-beta.2) (2024-06-29)
+
+
+### Features
+
+* add support for BytesIO inputs ([bab82e8](https://github.com/f-aguzzi/tesi/commit/bab82e8cfc208062a66dee871f28ca32835c951d))
+
+## [3.1.0-beta.1](https://github.com/f-aguzzi/tesi/compare/v3.0.0...v3.1.0-beta.1) (2024-06-29)
+
+
+### Features
+
+* **DF:** add file-like input capabilities ([e5c707b](https://github.com/f-aguzzi/tesi/commit/e5c707bd5897f363a831cb21ae7d81fb0699c3c7))
+
+
+### Docs
+
+* fix broken links in cookbook and documentation ([2353ba9](https://github.com/f-aguzzi/tesi/commit/2353ba928c155a9abf2ff7ba416440e9fe1668f9))
+
 ## [3.0.0](https://github.com/f-aguzzi/tesi/compare/v2.5.1...v3.0.0) (2024-06-27)
 
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "chemfusekit"
-version = "3.0.0"
+version = "3.1.0b3"
 description = "A minimal Python / Jupyter Notebook / Colab library for data fusion and chemometrical analysis."
 authors = [
     { name = "Federico Aguzzi", email = "62149513+f-aguzzi@users.noreply.github.com" }

diff --git a/src/chemfusekit/df.py b/src/chemfusekit/df.py
@@ -1,5 +1,6 @@
 """Performs low-level data fusion on input arrays, outputs the results"""
-from typing import Optional, List
+from typing import Optional, List, IO
+from io import BytesIO
 
 import numpy as np
 import pandas as pd
@@ -16,7 +17,7 @@
 class Table:
     """Holds the path, preprocessing choice and sheet name for a single Excel table."""
 
-    def __init__(self, file_path: str, sheet_name: str, preprocessing: str, feature_selection: str | None = None,
+    def __init__(self, file_path: str | IO | BytesIO, sheet_name: str, preprocessing: str, feature_selection: str | None = None,
                  class_column: str = 'Substance', index_column: str | None = None):
         self.file_path = file_path
         self.sheet_name = sheet_name
@@ -133,21 +134,27 @@ def fuse(self):
                     ax1.set_title(f'Original data')
                     ax2.plot(wl, preprocessed_x.T)
                     ax2.set_title(f'Processed table with {table.preprocessing}')
-                    if table.file_path.endswith('.xlsx'):
-                        fig.suptitle(f'Imported table: {table.sheet_name} from {table.file_path}')
+                    if isinstance(table.file_path, str):
+                        if table.file_path.endswith('.xlsx'):
+                            fig.suptitle(f'Imported table: {table.sheet_name} from {table.file_path}')
+                        else:
+                            fig.suptitle(f'Imported table: {table.file_path}')
                     else:
-                        fig.suptitle(f'Imported table: {table.file_path}')
+                        fig.suptitle(f'Imported table: ')
                 else:
                     # Let's plot the different datasets we preprocessed
                     fig, ax1 = plt.subplots(1, figsize=(15, 15))
                     if x.shape[1] == 1:
                         ax1.plot(x)
                     else:
                         ax1.plot(wl, x.T)
-                    if table.file_path.endswith('.xlsx'):
-                        fig.suptitle(f'Imported table: {table.sheet_name} from {table.file_path} (no preprocessing)')
+                    if isinstance(table.file_path, str):
+                        if table.file_path.endswith('.xlsx'):
+                            fig.suptitle(f'Imported table: {table.sheet_name} from {table.file_path} (no preprocessing')
+                        else:
+                            fig.suptitle(f'Imported table: {table.file_path}  (no preprocessing)')
                     else:
-                        fig.suptitle(f'Imported table: {table.file_path} (no preprocessing)')
+                        fig.suptitle(f'Imported table (no preprocessing):')
 
             # Create a new DataFrame with the processed numerical attributes
             processed_dataframe_x = pd.DataFrame(
@@ -230,31 +237,29 @@ def _perform_feature_selection(table: Table, data_model: BaseDataModel) -> BaseD
 
     @staticmethod
     def _import_table(file_path, sheet_name) -> pd.DataFrame:
-        """Imports a table from a file"""
-        try:
-            # Autodetect the format based on the file extension
-            if file_path.endswith('.xlsx'):
-                table_data = pd.read_excel(
-                    file_path,
-                    sheet_name=sheet_name,
-                    index_col=0,
-                    header=0
-                )
-            elif file_path.endswith('.csv'):
-                table_data = pd.read_csv(
-                    file_path,
-                    index_col=0,
-                    header=0
-                )
-            elif file_path.endswith('.json'):
-                table_data = pd.read_json(
-                    file_path,
-                    orient='table'  # or other orientations based on your json format
-                )
-            else:
-                raise ValueError(f"Unsupported file format: {file_path}")
-        except Exception as exc:
-            raise FileNotFoundError("Error opening the selected files.") from exc
+        """Imports a table from a file or file-like object"""
+        if isinstance(file_path, IO) or isinstance(file_path, BytesIO):
+            # Handle file-like objects
+            try:
+                table_data = pd.read_excel(file_path, sheet_name=sheet_name, index_col=0, header=0)
+            except Exception as exc:
+                raise ValueError("Error reading the file-like object.") from exc
+        elif isinstance(file_path, str):
+            # Handle file paths
+            try:
+                # Autodetect the format based on the file extension
+                if file_path.endswith('.xlsx'):
+                    table_data = pd.read_excel(file_path, sheet_name=sheet_name, index_col=0, header=0)
+                elif file_path.endswith('.csv'):
+                    table_data = pd.read_csv(file_path, index_col=0, header=0)
+                elif file_path.endswith('.json'):
+                    table_data = pd.read_json(file_path, orient='table')
+                else:
+                    raise ValueError(f"Unsupported file format: {file_path}")
+            except Exception as exc:
+                raise FileNotFoundError("Error opening the selected files.") from exc
+        else:
+            raise TypeError("Unsupported file type. Expected str or IO.")
 
         return table_data
 

diff --git a/thesis/4.4-case-study-GUI.md b/thesis/4.4-case-study-GUI.md
@@ -8,6 +8,117 @@ Uno degli obiettivi originali per la costruzione di ChemFuseKit, poi eliminato,
 
 È stato scelto di impiegare la libreria Streamlit per la costruzione dell’interfaccia grafica in questo esempio. Si tratta di una libreria *open source* per trasformare script per l’analisi di dati in web app facilmente condivisibili, programmate in puro Python, senza che sia necessaria esperienza nel web development. Il design grafico, la disponibilità di componenti specifici e il loro aspetto estetico sono già definiti dalla libreria. Il caricamento di file è semplice, e permette di passare tabelle all’applicazione con estrema facilità. I componenti di Streamlit permettono la visualizzazione grafica di strutture dati basate su DataFrame della libreria Pandas, come quelle contenute in ChemFuseKit. Il processo di design dell’interfaccia è dunque puramente incentrato sull’ambito funzionale. Al programmatore è soltanto richiesto di decidere cosa inserire nella pagina, e come utilizzare i componenti grafici per invocare le funzionalità dei propri script o delle proprie librerie. Questo permette la prototipazione di web app complete con estrema rapidità, e rende la creazione dell’interfaccia un pensiero secondario in progetti, come questo, in cui la parte fondamentale riguarda l’elaborazione dei dati.
 
+Il codice complessivo dell'applicazione, brevissimo e di facile lettura, è il seguente:
+
+\tiny
+```python
+"""
+# Data fusion
+
+A graphical user interface for the data fusion module in `ChemFuseKit`.
+"""
+
+import streamlit as st
+from chemfusekit.df import Table, DFSettings, DF
+from io import BytesIO
+import pandas as pd
+
+st.title("ChemFuseKit data fusion module")
+
+st.markdown(f"""
+Use this web application to leverage the data fusion abilities of ChemFuseKit.
+
+**Instructions:**
+1. upload your table files
+2. insert the settings for each table in the forms, and submit them one by one
+3. select the fusion technique
+4. click "Fuse tables"
+5. download the resulting data
+""")
+
+tables = st.file_uploader(label="Upload your tables here", accept_multiple_files=True)
+
+if "tabled_tables" not in st.session_state:
+    st.session_state.tabled_tables = []
+
+for table in tables:
+    file = BytesIO(table.read())
+    with st.form(f"Form for table {table}"):
+        st.markdown(f"Import settings for: {table.name}")
+        if table.name.endswith(".xlsx"):
+            sheet_name = st.text_input("Sheet name: ")
+        else:
+            sheet_name = 'none'
+        preprocessing = st.selectbox(
+            "Preprocessing (SNV, Savitski-Golay, both or none)",
+            ("snv", "savgol", "savgol+snv", "none"))
+        feature_selection = st.selectbox(
+            "Feature selection (PCA, PLSDA or none)",
+            ("pca", "plsda", "none"))
+        class_column = st.text_input("Class column: ")
+        index_column = st.text_input("Index column: ")
+        submitted = st.form_submit_button("Submit")
+
+        if submitted:
+            st.session_state.tabled_tables.append(Table(
+                file_path=file,
+                sheet_name=sheet_name if sheet_name != '' else 'Sheet1',
+                preprocessing=preprocessing,
+                feature_selection=feature_selection if feature_selection != 'none' else None,
+                class_column=class_column if class_column != '' else 'Substance',
+                # index_column=index_column
+            ))
+
+if len(tables) > 0:
+    st.markdown(f"Imported {len(st.session_state.tabled_tables)} tables.")
+
+if len(st.session_state.tabled_tables) > 0:
+    fusion_type = st.selectbox(
+            "Fusion technique: ",
+            ("concat", "outer"))
+
+    if st.button("Fuse data"):
+        df = DF(DFSettings(output='none', method=fusion_type), st.session_state.tabled_tables)
+        df.fuse()
+        df.fused_data.x_train
+
+        buffer = BytesIO()
+        with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
+            # Write each dataframe to a different worksheet.
+            df.fused_data.x_train.to_excel(writer, sheet_name='Sheet1')
+
+            # Close the Pandas Excel writer and output the Excel file to the buffer
+            writer.close()
+
+            st.download_button(
+                label="Download fused data as Excel",
+                data=buffer,
+                file_name="fused_data.xlsx",
+                mime="application/vnd.ms-excel"
+            )
+```
+
+\normalsize
+L'applicazione si avvia presentando una semplice lista delle istruzioni, e un bottone per caricare le tabelle.
+
+![Schermata iniziale e caricamento delle tabelle](cs4-1.png)
+
+Dopodiché è possibile selezionare, tramite una form, le impostazioni per le singole tabelle da importare.
+
+![Impostazioni per le singole tabelle](cs4-2.png)
+
+Successivamente si possono scegliere le opzioni per la fusione, e avviarla.
+
+![Impostazioni e avvio della fusione](cs4-3.png)
+
+I risultati della fusione sono visualizzabili direttamente nell'applicazione.
+
+![Risultati della fusione](cs4-4.png)
+
+A fusione eseguita, è possibile scaricare il file XLSX contenente i dati fusi. Il contenuto del file mantiene la struttura visualizzata nell'anteprima all'interno dell'applicazione.
+
+![Risultati della fusione, scaricati come file](cs4-5.png)
+
 ### Risultati
 
 ### Discussione

diff --git a/thesis/998-ringraziamenti.md b/thesis/998-ringraziamenti.md
@@ -1,3 +1,5 @@
+\newpage
+
 # Ringraziamenti
 
 Ringraziamenti.

diff --git a/thesis/999-bibliografia.md b/thesis/999-bibliografia.md
@@ -1,3 +1,5 @@
+\newpage
+
 # Bibliografia
 
 Bajoub, A., Medina-Rodríguez, S., Gómez-Romero, M., El Amine, A., Bagur-González, M. G., Fernández-Gutiérrez, A., Carrasco-Pancorbo, A. (2017). Assessing the varietal origin of extra-virgin olive oil using liquid chromatography fingerprints of phenolic compound, data fusion and chemometrics. *Food Chemistry*, 215, 245-255.

diff --git a/thesis/cs4-1.png b/thesis/cs4-1.png
diff --git a/thesis/cs4-2.png b/thesis/cs4-2.png
diff --git a/thesis/cs4-3.png b/thesis/cs4-3.png
diff --git a/thesis/cs4-4.png b/thesis/cs4-4.png
diff --git a/thesis/cs4-5.png b/thesis/cs4-5.png
diff --git a/thesis/use_cases.png b/thesis/use_cases.png
diff --git a/thesis/use_cases.puml b/thesis/use_cases.puml
@@ -0,0 +1,35 @@
+@startuml
+rectangle "Libreria Python" {
+	(Imparare la chemiometria)
+	(Classificare i dati dalla scena del crimine)
+	(Fondere i dati)
+	(Importare ed esportare i dati)
+	(Creare grafici)
+}
+
+rectangle "Ambiente Colab" {
+	(Utilizzare la libreria da Colab)
+}
+
+rectangle "Applicazione grafica" {
+	(Utilizzare l'applicazione grafica)
+}
+
+(Aggiungere nuove funzionalità)
+
+"Studente" --> (Imparare la chemiometria)
+"Analista forense" --> (Classificare i dati dalla scena del crimine)
+"Chiunque" --> (Fondere i dati)
+"Chiunque" --> (Importare ed esportare i dati)
+"Chiunque" --> (Creare grafici)
+"Chiunque" -up-> (Utilizzare l'applicazione grafica)
+"Utente di Colab" --> (Utilizzare la libreria da Colab)
+"Gestore del sistema" -up-> (Aggiungere nuove funzionalità)
+"Programmatore esterno" -down-> (Applicazione grafica) : <<implementa>>
+
+"Ambiente Colab" ..|> "Libreria Python" : <<usa>>
+"Applicazione grafica" ..|> "Libreria Python" : <<basato su>>
+
+(Aggiungere nuove funzionalità) -up-|> "Ambiente Colab"
+(Aggiungere nuove funzionalità) -up-|> "Libreria Python"
+@enduml