New Helper Functions Added

abdozmantar · Sep 27, 2024 · 26db838 · 26db838
1 parent e497d96
commit 26db838
Show file tree

Hide file tree

Showing 22 changed files with 624 additions and 166 deletions.
diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 
 <p align="center">
   <a href="https://badge.fury.io/py/zipml">
-    <img src="https://badge.fury.io/py/zipml.svg" alt="PyPI version" />
+    <img src="https://d25lcipzij17d.cloudfront.net/badge.svg?id=py&r=r&ts=1683906897&type=6e&v=0.2.3&x2=0" alt="PyPI version" />
   </a>
   <a href="https://github.com/abdozmantar/zipml/actions">
       <img src="https://github.com/abdozmantar/zipml/actions/workflows/ci.yml/badge.svg" alt="Build Status" />
@@ -36,6 +36,9 @@
 - **Model Comparison**: Compare the performance of different models with ease, providing metrics and visual feedback.
 - **CLI Support**: Run machine learning tasks directly from the command line.
 - **Extensible**: Add your own models and customize workflows as needed.
+- **Visualization Tools**: Includes tools for visualizing model performance metrics, making model behavior easier to understand.
+- **Hyperparameter Tuning**: Support for hyperparameter tuning to optimize model performance.
+- **Data Preprocessing**: Built-in data preprocessing steps to handle missing values, scaling, and encoding.
 
 ## Installation
 
@@ -60,16 +63,34 @@ pip install .
 Here's a practical example of how to use ZipML:
 
 ```python
-from zipml import split_data, compare_models, save_confusion_matrix
-from sklearn.datasets import load_iris
+import pandas as pd
+from zipml.model import analyze_model_predictions
+from zipml.model import calculate_model_results
+from zipml.visualization import save_and_plot_confusion_matrix
+from zipml.data import split_data
+from zipml import compare_models
 from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
 from sklearn.linear_model import LogisticRegression
 
-# Load data
-data = load_iris()
-X, y = data.data, data.target
 
-# Split data
+# Sample dataset
+data = {
+    'feature_1': [0.517, 0.648, 0.105, 0.331, 0.781, 0.026, 0.048],
+    'feature_2': [0.202, 0.425, 0.643, 0.721, 0.646, 0.827, 0.303],
+    'feature_3': [0.897, 0.579, 0.014, 0.167, 0.015, 0.358, 0.744],
+    'feature_4': [0.457, 0.856, 0.376, 0.527, 0.648, 0.534, 0.047],
+    'feature_5': [0.046, 0.118, 0.222, 0.001, 0.969, 0.239, 0.203],
+    'target': [0, 1, 1, 1, 1, 1, 0]
+}
+
+# Creating DataFrame
+df = pd.DataFrame(data)
+
+# Splitting data into features (X) and target (y)
+X = df.drop('target', axis=1)
+y = df['target']
+
+# Split the data into training and test sets
 X_train, X_test, y_train, y_test = split_data(X, y)
 
 # Define models
@@ -79,12 +100,18 @@ models = [
     GradientBoostingClassifier()
 ]
 
-# Compare models
+# Compare models and select the best one
 best_model, performance = compare_models(models, X_train, X_test, y_train, y_test)
-print(f"Best model: {best_model}")
+print(f"Best model: {best_model} with performance: {performance}")
+
+# Calculate performance metrics for the best model
+best_model_metrics = calculate_model_results(y_test, best_model.predict(X_test))
+
+# Analyze model predictions
+val_df, most_wrong = analyze_model_predictions(best_model, X_test, y_test)
 
-# Save confusion matrix
-save_confusion_matrix(y_test, best_model.predict(X_test))
+# Save and plot confusion matrix
+save_and_plot_confusion_matrix(y_test, best_model.predict(X_test), save_path="confusion_matrix.png")
 ```
 
 ### CLI Usage

diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name='zipml',
-    version='0.2.3',
+    version='0.2.4',
     description='A simple AutoML tool for small datasets with useful helper functions',
     author='Abdullah OZMANTAR',
     author_email='abdullahozmntr@gmail.com',

diff --git a/zipml/__init__.py b/zipml/__init__.py
@@ -1,2 +1,5 @@
-from .helpers import split_data, save_confusion_matrix, load_data, get_class_distribution, plot_class_distribution, plot_results
-from .zipml import evaluate_model, optimize_hyperparameters, train_model,compare_models,save_model, load_model,predict, main 
+from .zipml import *
+from .visualization import *
+from .utils import *
+from .model import *
+from .data import *
diff --git a/zipml/data/__init__.py b/zipml/data/__init__.py
@@ -0,0 +1,2 @@
+from .preprocessing import label_encode_labels, one_hot_encode_labels, split_data
+from .file_operations import walk_through_dir, unzip_data, read_lines_from_file, load_data
diff --git a/zipml/data/file_operations.py b/zipml/data/file_operations.py
@@ -0,0 +1,86 @@
+import os
+import pandas as pd
+import zipfile
+import logging
+from typing import Optional, List
+
+# Configure the root logger as a side effect of importing this module:
+# INFO level, "<timestamp> - <level> - <message>" format.
+# NOTE(review): logging.basicConfig does nothing when the root logger already
+# has handlers, so an application that configured logging first keeps its own
+# setup. Libraries usually leave this call to the application - confirm that
+# configuring logging at import time is intended.
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+def walk_through_dir(dir_path: str) -> pd.DataFrame:
+    """
+    Walk the directory tree rooted at dir_path and tabulate what os.walk yields.
+
+    One row is produced per directory visited, the root included. Each visit
+    is also reported through logging.info.
+
+    Args:
+        dir_path (str): Root directory to traverse.
+
+    Returns:
+        pd.DataFrame: One row per visited directory with the columns
+            'dir_path' (path of the directory), 'dir_names' (list of its
+            immediate subdirectory names) and 'file_names' (list of the
+            file names it contains). os.walk ignores listing errors by
+            default, so a missing or unreadable root yields an empty frame
+            instead of raising.
+    """
+    # Parallel lists: index i of each list describes the i-th visited directory
+    dir_paths = []  # Path of each visited directory
+    dir_names = []  # List of subdirectory names, one list per visited directory
+    file_names = []  # List of file names, one list per visited directory (any file type)
+
+    # Visit the root and every directory below it
+    for dirpath, dirnames, filenames in os.walk(dir_path):
+        # Record this directory's entry in each of the parallel lists
+        dir_paths.append(dirpath)
+        dir_names.append(dirnames)
+        file_names.append(filenames)
+
+        # The message says "images", but the count covers every file regardless of type
+        logging.info(f"There are {len(dirnames)} directories and {len(filenames)} images in '{dirpath}'.")
+
+    # Assemble the three parallel lists into the result columns
+    df = pd.DataFrame({
+        'dir_path': dir_paths,
+        'dir_names': dir_names,
+        'file_names': file_names
+    })
+
+    return df
+
+def unzip_data(filename: str, extract_path: Optional[str] = None) -> None:
+    """
+    Extract every member of a zip archive into a target directory.
+
+    Args:
+        filename (str): Path to the zip archive to extract.
+        extract_path (Optional[str]): Directory to extract into. Any falsy value
+                                       (None or an empty string) falls back to ".",
+                                       the current working directory.
+
+    Returns:
+        None. The only effect is the files written to disk.
+
+    Note:
+        No errors are handled here: whatever zipfile.ZipFile or extractall
+        raises (for example when the file is missing or is not a valid zip
+        archive) propagates to the caller.
+    """
+    # The context manager closes the archive even if extraction fails
+    with zipfile.ZipFile(filename, "r") as zip_ref:
+        # Truthiness test, not an `is None` test: "" is treated like None
+        zip_ref.extractall(extract_path if extract_path else ".")
+
+def read_lines_from_file(filename: str) -> List[str]:
+    """
+    Read a text file and return its lines as a list of strings.
+
+    Args:
+        filename (str): Path to the text file to read.
+
+    Returns:
+        List[str]: The file's lines in order. Because file.readlines() is
+            used, each line keeps its trailing newline character (the last
+            line has one only if the file ends with a newline). An empty
+            file gives an empty list.
+
+    Note:
+        No encoding is passed to open(), so the platform default text
+        encoding applies. Errors from open() or from decoding propagate to
+        the caller.
+    """
+    # Text mode; the context manager closes the file once the lines are read
+    with open(filename, "r") as file:
+        return file.readlines()  # Whole file is read into memory at once
+
+
+
+def load_data(file_path: str) -> pd.DataFrame:
+    """
+    Load a dataset from a CSV file into a pandas DataFrame.
+
+    The path is logged at INFO level before reading. The file is parsed by
+    pd.read_csv with all of its default options (no custom separator, header,
+    dtype or encoding is passed), and any error it raises propagates to the
+    caller.
+    
+    Parameters:
+        file_path (str): Path to the CSV file.
+    
+    Returns:
+        DataFrame: Loaded data as a pandas DataFrame.
+    """
+    logging.info(f"Loading dataset from {file_path}.")
+    return pd.read_csv(file_path)
+
+
+
diff --git a/zipml/data/preprocessing/__init__.py b/zipml/data/preprocessing/__init__.py
@@ -0,0 +1,2 @@
+from .encoding import label_encode_labels, one_hot_encode_labels
+from .split_data import split_data
diff --git a/zipml/data/preprocessing/encoding.py b/zipml/data/preprocessing/encoding.py
@@ -0,0 +1,36 @@
+import numpy as np
+import pandas as pd
+from sklearn.calibration import LabelEncoder
+from sklearn.preprocessing import OneHotEncoder
+
+
+def one_hot_encode_labels(df: pd.DataFrame, column_name: str) -> np.ndarray:
+    """
+    One-hot encodes the values of one DataFrame column.
+
+    A fresh OneHotEncoder is fitted on every call and is not returned, so the
+    learned category order cannot be reused to encode other data later.
+
+    Args:
+        df (pd.DataFrame): DataFrame containing the target column.
+        column_name (str): Name of the column to be one-hot encoded.
+
+    Returns:
+        np.ndarray: Dense 2-D array of one-hot encoded labels, one row per
+            DataFrame row and one column per distinct value found.
+    """
+    # sparse_output=False requests a dense ndarray instead of a sparse matrix.
+    # NOTE(review): `sparse_output` was introduced in scikit-learn 1.2 (older
+    # releases call it `sparse`) - confirm the minimum supported version.
+    one_hot_encoder = OneHotEncoder(sparse_output=False)
+    # The encoder expects 2-D input, so the 1-D column becomes an (n, 1) array
+    labels = df[column_name].to_numpy().reshape(-1, 1)
+    return one_hot_encoder.fit_transform(labels)
+
+def label_encode_labels(df: pd.DataFrame, column_name: str) -> tuple:
+    """
+    Label encodes one DataFrame column and returns the codes with the class names.
+
+    A fresh LabelEncoder is fitted on every call; the encoder itself is not
+    returned, only its output and its learned classes.
+
+    Args:
+        df (pd.DataFrame): DataFrame containing the target column.
+        column_name (str): Name of the column to be label encoded.
+
+    Returns:
+        tuple: (Encoded labels, Class names). The encoded labels are integer
+            codes, one per DataFrame row; the class names are the encoder's
+            `classes_` array, where position i holds the original value
+            that was mapped to code i.
+    """
+    # NOTE(review): this file imports LabelEncoder from sklearn.calibration;
+    # its documented home is sklearn.preprocessing - confirm the import.
+    label_encoder = LabelEncoder()
+    labels_encoded = label_encoder.fit_transform(df[column_name].to_numpy())
+    # classes_ is populated by the fit above
+    class_names = label_encoder.classes_
+    return labels_encoded, class_names
diff --git a/zipml/data/preprocessing/split_data.py b/zipml/data/preprocessing/split_data.py
@@ -0,0 +1,23 @@
+
+import logging
+import pandas as pd
+from typing import Any, Tuple, Union
+from sklearn.model_selection import train_test_split
+
+
+# Configure the root logger as a side effect of importing this module:
+# INFO level, "<timestamp> - <level> - <message>" format.
+# NOTE(review): logging.basicConfig does nothing when the root logger already
+# has handlers, so only the first such call in a process takes effect.
+# Libraries usually leave this call to the application - confirm that
+# configuring logging at import time is intended.
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+def split_data(X: Union[pd.DataFrame, Any], y: Any, test_size: float = 0.2) -> Tuple[Union[pd.DataFrame, Any], Union[pd.DataFrame, Any], Any, Any]:
+    """
+    Splits data into training and testing sets.
+
+    Thin wrapper around sklearn's train_test_split that logs the requested
+    test size and fixes the random seed, so repeated calls on the same input
+    produce the same split. No `stratify` argument is passed, so class
+    proportions are not preserved between the two sets.
+    
+    Parameters:
+        X (DataFrame or array-like): Features.
+        y (array-like): Target labels.
+        test_size (float): Proportion of the dataset to include in the test split.
+    
+    Returns:
+        tuple: Split datasets (X_train, X_test, y_train, y_test), returned
+            exactly as train_test_split produces them.
+            NOTE(review): scikit-learn documents that return value as a
+            list; it unpacks the same way, but confirm if callers rely on
+            it being a tuple.
+    """
+    logging.info(f"Splitting data with test size of {test_size}.")
+    # random_state=42 is hard-coded: the split is deterministic and callers cannot change the seed
+    return train_test_split(X, y, test_size=test_size, random_state=42)
diff --git a/zipml/helpers.py b/zipml/helpers.py
diff --git a/zipml/model/__init__.py b/zipml/model/__init__.py
@@ -0,0 +1,3 @@
+from .analyze_model_predictions import analyze_model_predictions
+from .measure_prediction_time import measure_prediction_time
+from .calculate_model_results import calculate_model_results
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		from .preprocessing import label_encode_labels, one_hot_encode_labels, split_data
		from .file_operations import walk_through_dir, unzip_data, read_lines_from_file, load_data
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		from .encoding import label_encode_labels, one_hot_encode_labels
		from .split_data import split_data