Skip to content

Commit

Permalink
New Helper Functions Added
Browse files Browse the repository at this point in the history
  • Loading branch information
abdozmantar committed Sep 27, 2024
1 parent e497d96 commit 26db838
Show file tree
Hide file tree
Showing 22 changed files with 624 additions and 166 deletions.
49 changes: 38 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

<p align="center">
<a href="https://badge.fury.io/py/zipml">
<img src="https://badge.fury.io/py/zipml.svg" alt="PyPI version" />
<img src="https://d25lcipzij17d.cloudfront.net/badge.svg?id=py&r=r&ts=1683906897&type=6e&v=0.2.3&x2=0" alt="PyPI version" />
</a>
<a href="https://github.com/abdozmantar/zipml/actions">
<img src="https://github.com/abdozmantar/zipml/actions/workflows/ci.yml/badge.svg" alt="Build Status" />
Expand Down Expand Up @@ -36,6 +36,9 @@
- **Model Comparison**: Compare the performance of different models with ease, providing metrics and visual feedback.
- **CLI Support**: Run machine learning tasks directly from the command line.
- **Extensible**: Add your own models and customize workflows as needed.
- **Visualization Tools**: Includes tools for visualizing model performance metrics, making model behavior easier to understand.
- **Hyperparameter Tuning**: Support for hyperparameter tuning to optimize model performance.
- **Data Preprocessing**: Built-in data preprocessing steps to handle missing values, scaling, and encoding.

## Installation

Expand All @@ -60,16 +63,34 @@ pip install .
Here's a practical example of how to use ZipML:

```python
from zipml import split_data, compare_models, save_confusion_matrix
from sklearn.datasets import load_iris
import pandas as pd
from zipml.model import analyze_model_predictions
from zipml.model import calculate_model_results
from zipml.visualization import save_and_plot_confusion_matrix
from zipml.data import split_data
from zipml import compare_models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

# Load data
data = load_iris()
X, y = data.data, data.target

# Split data
# Sample dataset
data = {
'feature_1': [0.517, 0.648, 0.105, 0.331, 0.781, 0.026, 0.048],
'feature_2': [0.202, 0.425, 0.643, 0.721, 0.646, 0.827, 0.303],
'feature_3': [0.897, 0.579, 0.014, 0.167, 0.015, 0.358, 0.744],
'feature_4': [0.457, 0.856, 0.376, 0.527, 0.648, 0.534, 0.047],
'feature_5': [0.046, 0.118, 0.222, 0.001, 0.969, 0.239, 0.203],
'target': [0, 1, 1, 1, 1, 1, 0]
}

# Creating DataFrame
df = pd.DataFrame(data)

# Splitting data into features (X) and target (y)
X = df.drop('target', axis=1)
y = df['target']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = split_data(X, y)

# Define models
Expand All @@ -79,12 +100,18 @@ models = [
GradientBoostingClassifier()
]

# Compare models
# Compare models and select the best one
best_model, performance = compare_models(models, X_train, X_test, y_train, y_test)
print(f"Best model: {best_model}")
print(f"Best model: {best_model} with performance: {performance}")

# Calculate performance metrics for the best model
best_model_metrics = calculate_model_results(y_test, best_model.predict(X_test))

# Analyze model predictions
val_df, most_wrong = analyze_model_predictions(best_model, X_test, y_test)

# Save confusion matrix
save_confusion_matrix(y_test, best_model.predict(X_test))
# Save and plot confusion matrix
save_and_plot_confusion_matrix(y_test, best_model.predict(X_test), save_path="confusion_matrix.png")
```

### CLI Usage
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

setup(
name='zipml',
version='0.2.3',
version='0.2.4',
description='A simple AutoML tool for small datasets with useful helper functions',
author='Abdullah OZMANTAR',
author_email='abdullahozmntr@gmail.com',
Expand Down
7 changes: 5 additions & 2 deletions zipml/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
from .helpers import split_data, save_confusion_matrix, load_data, get_class_distribution, plot_class_distribution, plot_results
from .zipml import evaluate_model, optimize_hyperparameters, train_model,compare_models,save_model, load_model,predict, main
from .zipml import *
from .visualization import *
from .utils import *
from .model import *
from .data import *
2 changes: 2 additions & 0 deletions zipml/data/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .preprocessing import label_encode_labels, one_hot_encode_labels, split_data
from .file_operations import walk_through_dir, unzip_data, read_lines_from_file, load_data
86 changes: 86 additions & 0 deletions zipml/data/file_operations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import os
import pandas as pd
import zipfile
import logging
from typing import Optional, List

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def walk_through_dir(dir_path: str) -> pd.DataFrame:
    """
    Summarise a directory tree as a Pandas DataFrame, one row per visited directory.

    Args:
        dir_path (str): Root directory to traverse with ``os.walk``.

    Returns:
        pd.DataFrame: Columns ``dir_path`` (the visited directory),
        ``dir_names`` (list of its immediate subdirectories) and
        ``file_names`` (list of the files it contains).
    """
    # Column-oriented accumulator; the keys become the DataFrame columns.
    columns = {
        'dir_path': [],
        'dir_names': [],
        'file_names': [],
    }

    for dirpath, dirnames, filenames in os.walk(dir_path):
        columns['dir_path'].append(dirpath)
        columns['dir_names'].append(dirnames)
        columns['file_names'].append(filenames)

        # The message says "images", but len(filenames) counts every file.
        logging.info(f"There are {len(dirnames)} directories and {len(filenames)} images in '{dirpath}'.")

    return pd.DataFrame(columns)

def unzip_data(filename: str, extract_path: Optional[str] = None) -> None:
    """
    Extract every member of a zip archive.

    Args:
        filename (str): Path to the zip file to extract.
        extract_path (Optional[str]): Destination directory. When None (or
            empty), the archive is extracted into the current working directory.
    """
    # A falsy extract_path (None or "") falls back to the current directory.
    destination = extract_path or "."

    with zipfile.ZipFile(filename, "r") as archive:
        archive.extractall(destination)

def read_lines_from_file(filename: str, encoding: str = "utf-8") -> List[str]:
    """
    Reads the contents of a text file and returns the lines as a list of strings.

    Args:
        filename (str): A string containing the path to the target text file.
        encoding (str): Text encoding used to decode the file. Defaults to
            "utf-8" so the result does not depend on the platform locale.

    Returns:
        List[str]: A list of strings, where each string represents a line from
        the file (line terminators are kept, as with ``file.readlines()``).
    """
    # An explicit encoding avoids locale-dependent decoding across platforms.
    with open(filename, "r", encoding=encoding) as file:
        return file.readlines()



def load_data(file_path: str) -> pd.DataFrame:
    """
    Read a CSV file into a pandas DataFrame, logging the path first.

    Parameters:
        file_path (str): Location of the CSV file to read.

    Returns:
        DataFrame: The parsed contents of the CSV file.
    """
    logging.info(f"Loading dataset from {file_path}.")
    dataset = pd.read_csv(file_path)
    return dataset



2 changes: 2 additions & 0 deletions zipml/data/preprocessing/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .encoding import label_encode_labels, one_hot_encode_labels
from .split_data import split_data
36 changes: 36 additions & 0 deletions zipml/data/preprocessing/encoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import numpy as np
import pandas as pd
from sklearn.calibration import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


def one_hot_encode_labels(df: pd.DataFrame, column_name: str) -> np.ndarray:
    """
    Build a dense one-hot encoding of a single DataFrame column.

    Args:
        df (pd.DataFrame): DataFrame holding the column to encode.
        column_name (str): Name of the column whose values are encoded.

    Returns:
        np.ndarray: One-hot encoded labels, one row per DataFrame row.
    """
    encoder = OneHotEncoder(sparse_output=False)

    # The encoder is fitted on a 2-D array, so the column is reshaped
    # into a single feature of shape (n_rows, 1).
    column_values = df[column_name].to_numpy()
    label_column = column_values.reshape(-1, 1)

    encoded = encoder.fit_transform(label_column)
    return encoded

def label_encode_labels(df: pd.DataFrame, column_name: str) -> tuple:
    """
    Label encodes the target labels and returns both the encoded labels and the class names.

    Args:
        df (pd.DataFrame): DataFrame containing the target column.
        column_name (str): Name of the column to be label encoded.

    Returns:
        tuple: (Encoded labels, Class names). The class names are the fitted
        encoder's ``classes_`` attribute.
    """
    # Resolve LabelEncoder from its public home. The module-level import takes
    # it from sklearn.calibration, which only exposes it as a side effect of
    # that module's own internal imports.
    from sklearn.preprocessing import LabelEncoder

    label_encoder = LabelEncoder()
    labels_encoded = label_encoder.fit_transform(df[column_name].to_numpy())
    class_names = label_encoder.classes_
    return labels_encoded, class_names
23 changes: 23 additions & 0 deletions zipml/data/preprocessing/split_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@

import logging
import pandas as pd
from typing import Any, Tuple, Union
from sklearn.model_selection import train_test_split


logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def split_data(
    X: Union[pd.DataFrame, Any],
    y: Any,
    test_size: float = 0.2,
    random_state: Union[int, None] = 42,
) -> Tuple[Union[pd.DataFrame, Any], Union[pd.DataFrame, Any], Any, Any]:
    """
    Splits data into training and testing sets.

    Parameters:
        X (DataFrame or array-like): Features.
        y (array-like): Target labels.
        test_size (float): Proportion of the dataset to include in the test split.
        random_state (int or None): Seed forwarded to ``train_test_split``.
            Defaults to 42, the value that was previously hard-coded, so
            existing callers get the same split as before.

    Returns:
        tuple: Split datasets (X_train, X_test, y_train, y_test)
    """
    logging.info(f"Splitting data with test size of {test_size}.")
    return train_test_split(X, y, test_size=test_size, random_state=random_state)
103 changes: 0 additions & 103 deletions zipml/helpers.py

This file was deleted.

3 changes: 3 additions & 0 deletions zipml/model/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .analyze_model_predictions import analyze_model_predictions
from .measure_prediction_time import measure_prediction_time
from .calculate_model_results import calculate_model_results
Loading

0 comments on commit 26db838

Please sign in to comment.