feat(DF): add file-like input capabilities

f-aguzzi · Jun 29, 2024 · e5c707b · e5c707b
1 parent 5573f1a
commit e5c707b
Showing 1 changed file with 45 additions and 33 deletions.
diff --git a/src/chemfusekit/df.py b/src/chemfusekit/df.py
@@ -1,5 +1,5 @@
 """Performs low-level data fusion on input arrays, outputs the results"""
-from typing import Optional, List
+from typing import Optional, List, IO
 
 import numpy as np
 import pandas as pd
@@ -16,7 +16,7 @@
 class Table:
     """Holds the path, preprocessing choice and sheet name for a single Excel table."""
 
-    def __init__(self, file_path: str, sheet_name: str, preprocessing: str, feature_selection: str | None = None,
+    def __init__(self, file_path: str | IO, sheet_name: str, preprocessing: str, feature_selection: str | None = None,
                  class_column: str = 'Substance', index_column: str | None = None):
         self.file_path = file_path
         self.sheet_name = sheet_name
@@ -133,21 +133,35 @@ def fuse(self):
                     ax1.set_title(f'Original data')
                     ax2.plot(wl, preprocessed_x.T)
                     ax2.set_title(f'Processed table with {table.preprocessing}')
-                    if table.file_path.endswith('.xlsx'):
-                        fig.suptitle(f'Imported table: {table.sheet_name} from {table.file_path}')
+                    if isinstance(table.file_path, str):
+                        if table.file_path.endswith('.xlsx'):
+                            fig.suptitle(f'Imported table: {table.sheet_name} from {table.file_path}')
+                        else:
+                            fig.suptitle(f'Imported table: {table.file_path}')
                     else:
-                        fig.suptitle(f'Imported table: {table.file_path}')
+                        file_path = table.file_path.name
+                        if file_path.endswith('.xlsx'):
+                            fig.suptitle(f'Imported table: {table.sheet_name} from {file_path}')
+                        else:
+                            fig.suptitle(f'Imported table: {file_path}') 
                 else:
                     # Let's plot the different datasets we preprocessed
                     fig, ax1 = plt.subplots(1, figsize=(15, 15))
                     if x.shape[1] == 1:
                         ax1.plot(x)
                     else:
                         ax1.plot(wl, x.T)
-                    if table.file_path.endswith('.xlsx'):
-                        fig.suptitle(f'Imported table: {table.sheet_name} from {table.file_path} (no preprocessing)')
+                    if isinstance(table.file_path, str):
+                        if table.file_path.endswith('.xlsx'):
+                            fig.suptitle(f'Imported table: {table.sheet_name} from {table.file_path} (no preprocessing)')
+                        else:
+                            fig.suptitle(f'Imported table: {table.file_path} (no preprocessing)')
                     else:
-                        fig.suptitle(f'Imported table: {table.file_path} (no preprocessing)')
+                        file_path = table.file_path.name
+                        if file_path.endswith('.xlsx'):
+                            fig.suptitle(f'Imported table: {table.sheet_name} from {file_path} (no preprocessing)')
+                        else:
+                            fig.suptitle(f'Imported table: {file_path} (no preprocessing)')
 
             # Create a new DataFrame with the processed numerical attributes
             processed_dataframe_x = pd.DataFrame(
@@ -230,31 +244,29 @@ def _perform_feature_selection(table: Table, data_model: BaseDataModel) -> BaseD
 
     @staticmethod
     def _import_table(file_path, sheet_name) -> pd.DataFrame:
-        """Imports a table from a file"""
-        try:
-            # Autodetect the format based on the file extension
-            if file_path.endswith('.xlsx'):
-                table_data = pd.read_excel(
-                    file_path,
-                    sheet_name=sheet_name,
-                    index_col=0,
-                    header=0
-                )
-            elif file_path.endswith('.csv'):
-                table_data = pd.read_csv(
-                    file_path,
-                    index_col=0,
-                    header=0
-                )
-            elif file_path.endswith('.json'):
-                table_data = pd.read_json(
-                    file_path,
-                    orient='table'  # or other orientations based on your json format
-                )
-            else:
-                raise ValueError(f"Unsupported file format: {file_path}")
-        except Exception as exc:
-            raise FileNotFoundError("Error opening the selected files.") from exc
+        """Imports a table from a file or file-like object"""
+        if hasattr(file_path, 'read'):  # typing.IO is not a runtime base of real file objects
+            # Handle file-like objects
+            try:
+                table_data = pd.read_excel(file_path, sheet_name=sheet_name, index_col=0, header=0)
+            except Exception as exc:
+                raise ValueError("Error reading the file-like object.") from exc
+        elif isinstance(file_path, str):
+            # Handle file paths
+            try:
+                # Autodetect the format based on the file extension
+                if file_path.endswith('.xlsx'):
+                    table_data = pd.read_excel(file_path, sheet_name=sheet_name, index_col=0, header=0)
+                elif file_path.endswith('.csv'):
+                    table_data = pd.read_csv(file_path, index_col=0, header=0)
+                elif file_path.endswith('.json'):
+                    table_data = pd.read_json(file_path, orient='table')
+                else:
+                    raise ValueError(f"Unsupported file format: {file_path}")
+            except Exception as exc:
+                raise FileNotFoundError("Error opening the selected files.") from exc
+        else:
+            raise TypeError("Unsupported file type. Expected str or IO.")
 
         return table_data