diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..26b5cc7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,32 @@ +--- +name: Bug report +about: Create a report to help us improve +--- + + + +# Bug Report + +Bug: X does not work when I do Y + +## Description + +Info about the bug goes here. + +### Steps to Reproduce + +1. Step 1 +2. Step 2 +3. ... + +### Expected Result + +I was expecting ... + +You may write the expected result or add a screenshot. + +### Actual Results + +I actually got ... + +Would be awesome to link screenshots here and/or error messages received. \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/issue.md b/.github/ISSUE_TEMPLATE/issue.md new file mode 100644 index 0000000..b047619 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/issue.md @@ -0,0 +1,14 @@ +--- +name: Task +about: A small issue t. It will usually be labeled as `good first issue` or `enhancement`. +--- + + + +# Task Title + +Task: I am an Issue + +## Task Description + +This issue will... \ No newline at end of file diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..573af37 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,16 @@ +# Story Title + +[This is the Issue Title](https://github.com/username/repository-name/issues/1) + +## Changes made + +- made this +- did that + +## How does the solution address the problem + +This PR will... + +## Linked issues + +Resolves #1 \ No newline at end of file diff --git a/.github/workflows/development_CI.yaml b/.github/workflows/development_CI.yaml new file mode 100644 index 0000000..31f1d6e --- /dev/null +++ b/.github/workflows/development_CI.yaml @@ -0,0 +1,38 @@ +# Runs CI when pushing to develop branch +# runs pylint and pytest + +name: CI_develop_action + +on: + push: + branches: [ develop ] + pull_request: + branches: [ develop ] + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + + - name: Set up Python 3.8 + uses: actions/setup-python@v2 + with: + python-version: 3.8 + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install -r requirements.txt + python -m pip install pylint pytest pytest-mock pytest-cov + + - name: Test with pytest + run: | + pytest --cov=cobra tests/ + + # until we refactor accordingly + #- name: Lint check with pylint + # run: | + # pylint cobra diff --git a/.gitignore b/.gitignore index b2a2749..edb9762 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -#Ignoired directories in root folder +#Ignored directories in root folder # Byte-compiled / optimized / DLL files @@ -109,3 +109,4 @@ ENV/ # Other ignore files *.pptx *.ppt +.idea/ diff --git a/README.rst b/README.rst index f3a3738..b70a718 100644 --- a/README.rst +++ b/README.rst @@ -1,87 +1,89 @@ -===== -cobra -===== - -**cobra** is a Python package to build predictive models using logistic regression with a focus on performance and interpretation. It consists of several modules for data preprocessing, feature selection and model evaluation. The underlying methodology was developed at Python Predictions in the course of hundreds of business-related prediction challenges. It has been tweaked, tested and optimized over the years based on feedback from clients, our team, and academic researchers. 
- - -Main Features -============= - -- Prepare a given pandas DataFrame for predictive modelling: - - - partition into train/selection/validation sets - - create bins from continuous variables - - regroup categorical variables based on statistical significance - - replace missing values and - - add columns with incidence rate per category/bin - -- Perform univariate feature selection based on AUC -- Compute correlation matrix of predictors -- Find the suitable variables using forward feature selection -- Evaluate model performance and visualize the results - -Getting started -=============== - -These instructions will get you a copy of the project up and running on your local machine for usage, development and testing purposes. - -Requirements ------------- - -This package requires the usual Python packages for data science: - -- numpy (>=1.19.4) -- pandas (>=1.1.5) -- scipy (>=1.5.4) -- scikit-learn (>=0.23.1) -- matplotlib (>=3.3.3) -- seaborn (>=0.11.0) - - -These packages, along with their versions are listed in ``requirements.txt`` and can be installed using ``pip``: :: - - - pip install -r requirements.txt - - -**Note**: if you want to install cobra with e.g. pip, you don't have to install all of these requirements as these are automatically installed with cobra itself. - -Installation ------------- - -The easiest way to install cobra is using ``pip`` :: - - pip install -U pythonpredictions-cobra - -Contributing to cobra -===================== - -We'd love you to contribute to the development of cobra! There are many ways in which you can contribute, the most common of which is to contribute to the source code or documentation of the project. However, there are many other ways you can contribute (report issues, improve code coverage by adding unit tests, ...). -We use GitHub issue to track all bugs and feature requests. Feel free to open an issue in case you found a bug or in case you wish to see a new feature added. - -How to contribute code ----------------------- - -The preferred way to contribute to cobra is to fork the main repository on GitHub, then submit a "pull request" (PR). The first step is to get a local development copy by installing cobra from source through the following steps: - -- Fork the `project repository `_. For more details on how to fork a repository see `this guide `__ -- Clone your fork of cobra's repo. -- Open a shell and navigate to the folder where this repo was cloned in. -- Once you are in the folder, execute ``pip install --editable .``. -- Create a *feature branch* to do your development. -- Once your are finished developing, you can create a *pull request* from your fork (see `this guide `__ for detailed instructions). - -**Notes** - -- Make sure to follow the *PEP 8* styleguide if you make any changes to cobra. You should also write or modify unit test for your changes. -- To avoid duplicating work, it is highly recommended that you search through the issue tracker and/or the PR list. If in doubt, you can always reach out to us through email (cobra@pythonpredictions.com) - -Help and Support -================ - -Documentation -------------- - -- HTML documentation of the `individual modules `_ -- A step-by-step `tutorial `_ + + +.. image:: https://img.shields.io/pypi/v/pythonpredictions-cobra.svg + :target: https://pypi.org/project/pythonpredictions-cobra/ +.. image:: https://img.shields.io/pypi/dm/pythonpredictions-cobra.svg + :target: https://pypistats.org/packages/pythonpredictions-cobra +.. 
image:: https://github.com/PythonPredictions/cobra/actions/workflows/development_CI.yaml/badge.svg?branch=develop
+   :target: https://github.com/PythonPredictions/cobra/actions/workflows/development_CI.yaml
+
+------------------------------------------------------------------------------------------------------------------------------------
+
+=====
+cobra
+=====
+.. image:: material/logo.png
+   :width: 300
+
+**cobra** is a Python package to build predictive models using linear/logistic regression with a focus on performance and interpretation. It consists of several modules for data preprocessing, feature selection and model evaluation. The underlying methodology was developed at Python Predictions in the course of hundreds of business-related prediction challenges. It has been tweaked, tested and optimized over the years based on feedback from clients, our team, and academic researchers.
+
+Main Features
+=============
+
+- Prepare a given pandas DataFrame for predictive modelling:
+
+  - partition into train/selection/validation sets
+  - create bins from continuous variables
+  - regroup categorical variables based on statistical significance
+  - replace missing values and
+  - add columns with incidence rate per category/bin
+
+- Perform univariate feature selection based on AUC
+- Compute the correlation matrix of predictors
+- Find the most suitable variables using forward feature selection
+- Evaluate model performance and visualize the results
+
+Getting started
+===============
+
+These instructions will get you a copy of the project up and running on your local machine for usage, development and testing purposes.
+
+Requirements
+------------
+
+This package requires the usual Python packages for data science:
+
+- numpy (>=1.19.4)
+- pandas (>=1.1.5)
+- scipy (>=1.5.4)
+- scikit-learn (>=0.23.1)
+- matplotlib (>=3.3.3)
+- seaborn (>=0.11.0)
+
+These packages, along with their versions, are listed in ``requirements.txt`` and can be installed using ``pip``: ::
+
+    pip install -r requirements.txt
+
+**Note**: if you install cobra itself with e.g. ``pip``, you don't have to install these requirements separately; they are pulled in automatically with cobra.
+
+Installation
+------------
+
+The easiest way to install cobra is using ``pip``: ::
+
+    pip install -U pythonpredictions-cobra
+
+Contributing to cobra
+=====================
+
+We'd love you to contribute to the development of cobra! There are many ways to do so, the most common being contributions to the source code or documentation of the project, but there are many other ways to help (reporting issues, improving code coverage by adding unit tests, ...).
+We use GitHub issues to track all bugs and feature requests. Feel free to open an issue in case you find a bug or wish to see a new feature added.
+
+For more details, check our `wiki `_.
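Once cobra is installed (from PyPI or from a source checkout as described above), a minimal end-to-end sketch of the workflow listed under Main Features could look as follows. This is an illustration rather than an official quick start: the input file name is a placeholder, ``PreProcessor`` is assumed to be importable from ``cobra.preprocessing`` like the other preprocessing classes, and the split proportions follow the step-by-step tutorial elsewhere in this changeset: ::

    import pandas as pd
    from cobra.preprocessing import PreProcessor

    basetable = pd.read_csv("basetable.csv")  # placeholder input data

    # add a "split" column with train/selection/validation values
    basetable = PreProcessor.train_selection_validation_split(
        basetable,
        train_prop=0.6, selection_prop=0.2, validation_prop=0.2)

    # preprocessing pipeline (binning, regrouping, target encoding)
    # instantiated with its default parameters; see the tutorial for
    # fitting it and transforming the data
    preprocessor = PreProcessor.from_params()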
+ +Help and Support +================ + +Documentation +------------- + +- HTML documentation of the `individual modules `_ +- A step-by-step `tutorial `_ + +Outreach +------------- + +- Check out the Data Science Leuven Meetup `talk `_ by one of the core developers (second presentation) diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py index 49e92df..b5ac92f 100644 --- a/cobra/evaluation/evaluator.py +++ b/cobra/evaluation/evaluator.py @@ -35,15 +35,20 @@ class Evaluator(): probability_cutoff : float probability cut off to convert probability scores to a binary score roc_curve : dict - map containing true-positive-rate, false-positve-rate at various + map containing true-positive-rate, false-positive-rate at various thresholds (also incl.) + n_bins : int, optional + defines the number of bins used to calculate the lift curve for + (by default 10, so deciles) """ def __init__(self, probability_cutoff: float=None, - lift_at: float=0.05): + lift_at: float=0.05, + n_bins: int = 10): self.lift_at = lift_at self.probability_cutoff = probability_cutoff + self.n_bins = n_bins # Placeholder to store fitted output self.scalar_metrics = None @@ -85,7 +90,7 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray): self.roc_curve = {"fpr": fpr, "tpr": tpr, "thresholds": thresholds} self.confusion_matrix = confusion_matrix(y_true, y_pred_b) - self.lift_curve = Evaluator._compute_lift_per_decile(y_true, y_pred) + self.lift_curve = Evaluator._compute_lift_per_bin(y_true, y_pred, self.n_bins) self.cumulative_gains = Evaluator._compute_cumulative_gains(y_true, y_pred) @@ -199,8 +204,7 @@ def plot_confusion_matrix(self, path: str=None, dim: tuple=(12, 8), plt.show() - def plot_cumulative_response_curve(self, path: str=None, - dim: tuple=(12, 8)): + def plot_cumulative_response_curve(self, path: str=None, dim: tuple=(12, 8)): """Plot cumulative response curve Parameters @@ -430,10 +434,11 @@ def _compute_cumulative_gains(y_true: np.ndarray, return percentages, gains @staticmethod - def _compute_lift_per_decile(y_true: np.ndarray, - y_pred: np.ndarray) -> tuple: - """Compute lift of the model per decile, returns x-labels, lifts and - the target incidence to create cummulative response curves + def _compute_lift_per_bin(y_true: np.ndarray, + y_pred: np.ndarray, + n_bins: int = 10) -> tuple: + """Compute lift of the model for a given number of bins, returns x-labels, + lifts and the target incidence to create cumulative response curves Parameters ---------- @@ -441,6 +446,9 @@ def _compute_lift_per_decile(y_true: np.ndarray, True binary target data labels y_pred : np.ndarray Target scores of the model + n_bins : int, optional + defines the number of bins used to calculate the lift curve for + (by default 10, so deciles) Returns ------- @@ -451,7 +459,7 @@ def _compute_lift_per_decile(y_true: np.ndarray, lifts = [Evaluator._compute_lift(y_true=y_true, y_pred=y_pred, lift_at=perc_lift) - for perc_lift in np.arange(0.1, 1.1, 0.1)] + for perc_lift in np.linspace(1/n_bins, 1, num=n_bins, endpoint=True)] x_labels = [len(lifts)-x for x in np.arange(0, len(lifts), 1)] diff --git a/cobra/evaluation/pigs_tables.py b/cobra/evaluation/pigs_tables.py index 41959d7..3e39411 100644 --- a/cobra/evaluation/pigs_tables.py +++ b/cobra/evaluation/pigs_tables.py @@ -1,8 +1,8 @@ -# third party imports import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import numpy as np +from matplotlib.ticker import FuncFormatter import cobra.utils as utils @@ -91,26 +91,22 @@ def 
compute_pig_table(data: pd.DataFrame, def plot_incidence(df: pd.DataFrame, variable: str, - column_order: list=None, dim: tuple=(12, 8)): + column_order: list = None, dim: tuple = (12, 8)): """Function plots Predictor Incidence Graphs (PIGs). Bins are ordered in descening order of bin incidence unless specified otherwise with `column_order` list. - Parameters ---------- df: pd.DataFrame dataframe with cleaned, binned, partitioned and prepared data - variable: str variable for which the incidence plot will be shown - column_order: list, default=None explicit order of variable - dim: tuple, default=(12, 8) tuple with width and lentgh of the plot """ - df_plot = df[df['variable'] == variable] + df_plot = df[df['variable'] == variable].copy() if column_order is not None: @@ -131,41 +127,74 @@ def plot_incidence(df: pd.DataFrame, variable: str, with plt.style.context("seaborn-ticks"): fig, ax = plt.subplots(figsize=dim) - # First Axis - ax.bar(df_plot['label'], df_plot['pop_size'], - align='center', color="cornflowerblue") - ax.set_ylabel('population size', fontsize=16) + # ----------------- + # Left axis - incidence + # ----------------- + ax.plot(df_plot['label'], df_plot['incidence'], + color="#00ccff", marker=".", + markersize=20, linewidth=3, label='incidence rate per bin', + zorder=10) + + ax.plot(df_plot['label'], df_plot['avg_incidence'], + color="#022252", linestyle='--', linewidth=4, + label='average incidence rate', + zorder=10) + + # dummy line to have label on second axis from first + ax.plot(np.nan, "#939598", linewidth=6, label='bin size') + + # set labels & ticks + ax.set_ylabel('incidence', fontsize=16) ax.set_xlabel('{} bins' ''.format(variable), fontsize=16) ax.xaxis.set_tick_params(rotation=45, labelsize=14) ax.yaxis.set_tick_params(labelsize=14) - max_inc = max(df_plot['incidence']) + ax.set_yticks(np.arange(0, max(df_plot['incidence'])+0.05, 0.05)) + ax.yaxis.set_major_formatter( + FuncFormatter(lambda y, _: '{:.1%}'.format(y))) + + # removes ticks but keeps the labels + ax.tick_params(axis='both', which='both', length=0) + ax.tick_params(axis='y', colors="#00ccff") + ax.yaxis.label.set_color('#00ccff') - # Second Axis + # ----------------- + # Right Axis - bins + # ----------------- ax2 = ax.twinx() - plt.plot(df_plot['incidence'], color="darkorange", marker=".", - markersize=20, linewidth=3, label='incidence rate per bin') - plt.plot(df_plot['avg_incidence'], color="dimgrey", linewidth=4, - linestyle='--', - label='average incidence rate') + ax2.bar(df_plot['label'], df_plot['pop_size'], + align='center', color="#939598", zorder=1) - # dummy line to have label on second axis from first - ax2.plot(np.nan, "cornflowerblue", linewidth=6, label='bin size') - ax2.set_yticks(np.arange(0, max_inc+0.05, 0.05)) - ax2.set_yticklabels( - ['{:3.1f}%'.format(x*100) for x in ax2.get_yticks()]) + # set labels & ticks + ax2.set_ylabel('population size', fontsize=16) + ax2.set_xlabel('{} bins' ''.format(variable), fontsize=16) + ax2.xaxis.set_tick_params(rotation=45, labelsize=14) ax2.yaxis.set_tick_params(labelsize=14) - ax2.set_ylabel('incidence', fontsize=16) + ax2.yaxis.set_major_formatter( + FuncFormatter(lambda y, _: '{:.1%}'.format(y))) + + ax2.tick_params(axis='y', colors="#939598") + ax2.yaxis.label.set_color('#939598') + # Despine & prettify sns.despine(ax=ax, right=True, left=True) sns.despine(ax=ax2, left=True, right=False) ax2.spines['right'].set_color('white') ax2.grid(False) + # title & legend fig.suptitle('Incidence Plot - ' + variable, fontsize=22, y=1.02) - 
ax2.legend(frameon=False, bbox_to_anchor=(0., 1.01, 1., .102), - loc=3, ncol=1, mode="expand", borderaxespad=0., - prop={"size": 14}) + ax.legend(frameon=False, bbox_to_anchor=(0., 1.01, 1., .102), + loc=3, ncol=1, mode="expand", borderaxespad=0., + prop={"size": 14}) + + # Sets order ot layers + ax.set_zorder(1) + ax.patch.set_visible(False) + + del df_plot + + # Show plt.show() diff --git a/cobra/evaluation/plotting_utils.py b/cobra/evaluation/plotting_utils.py index 2bf0305..99a673f 100644 --- a/cobra/evaluation/plotting_utils.py +++ b/cobra/evaluation/plotting_utils.py @@ -14,17 +14,17 @@ def plot_univariate_predictor_quality(df_auc: pd.DataFrame, Parameters ---------- df_auc : pd.DatFrame - Contains for each variable the train auc and selection auc allong with + Contains for each variable the train auc and selection auc along with a boolean indicating whether or not it is selected based on the criteria dim : tuple, optional - tuple with width and lentgh of the plot + tuple with width and length of the plot path : str, optional path to store the figure """ df = (df_auc[df_auc["preselection"]] - .sort_values(by='AUC train', ascending=False)) + .sort_values(by='AUC selection', ascending=False)) df = pd.melt(df, id_vars=["predictor"], value_vars=["AUC train", "AUC selection"], @@ -60,7 +60,7 @@ def plot_correlation_matrix(df_corr: pd.DataFrame, df_corr : pd.DataFrame Correlation matrix dim : tuple, optional - tuple with width and lentgh of the plot + tuple with width and length of the plot path : str, optional path to store the figure """ @@ -89,7 +89,7 @@ def plot_performance_curves(model_performance: pd.DataFrame, contains train-selection-validation performance for each model trained in the forward feature selection dim : tuple, optional - tuple with width and lentgh of the plot + tuple with width and length of the plot path : str, optional path to store the figure """ @@ -141,7 +141,7 @@ def plot_variable_importance(df_variable_importance: pd.DataFrame, title : str, optional Title of the plot dim : tuple, optional - tuple with width and lentgh of the plot + tuple with width and length of the plot path : str, optional path to store the figure """ diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py index 8046b28..58b7620 100644 --- a/cobra/model_building/forward_selection.py +++ b/cobra/model_building/forward_selection.py @@ -1,10 +1,12 @@ import logging -log = logging.getLogger(__name__) import pandas as pd +from tqdm.auto import tqdm from cobra.model_building import LogisticRegressionModel as MLModel +log = logging.getLogger(__name__) + class ForwardFeatureSelection: @@ -159,7 +161,7 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str, def _forward_selection(self, train_data: pd.DataFrame, target_column_name: str, predictors: list, forced_predictors: list=[]) -> list: - """Perform the forward feature selection algoritm to compute a list + """Perform the forward feature selection algorithm to compute a list of models (with increasing performance?). The length of the list, i.e. the number of models is bounded by the max_predictors class attribute. 
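To make the docstring above concrete, the snippet below is a simplified, generic sketch of the forward-selection idea, written against scikit-learn directly. It is not cobra's actual implementation: cobra wraps its models in its own ``LogisticRegressionModel`` class and evaluates candidates with its own metric, while this sketch simply scores each candidate on training AUC for brevity. The outer loop over steps is the one that the change below wraps in a ``tqdm`` progress bar: ::

    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score

    def forward_selection_sketch(df: pd.DataFrame, target: str,
                                 predictors: list,
                                 max_predictors: int = 5) -> list:
        # returns one fitted model per step; each step adds the single
        # best-scoring candidate predictor to the selected set
        selected, models = [], []
        candidates = list(predictors)
        for _ in range(min(max_predictors, len(candidates))):
            best = None  # (auc, candidate, fitted model)
            for candidate in candidates:
                cols = selected + [candidate]
                model = LogisticRegression().fit(df[cols], df[target])
                auc = roc_auc_score(df[target],
                                    model.predict_proba(df[cols])[:, 1])
                if best is None or auc > best[0]:
                    best = (auc, candidate, model)
            selected.append(best[1])
            candidates.remove(best[1])
            models.append(best[2])
        return models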
@@ -186,7 +188,8 @@ def _forward_selection(self, train_data: pd.DataFrame, max_steps = 1 + min(self.max_predictors, len(predictors) + len(forced_predictors)) - for step in range(1, max_steps): + for step in tqdm(range(1, max_steps), desc="Sequentially adding best " + "predictor..."): if step <= len(forced_predictors): # first, we go through forced predictors candidate_predictors = [var for var in forced_predictors diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index 68c41a3..c24a550 100644 --- a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -17,18 +17,18 @@ # standard lib imports import re from typing import Optional - import logging -log = logging.getLogger(__name__) # third party imports import numpy as np import pandas as pd from scipy import stats - +from tqdm.auto import tqdm from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError +log = logging.getLogger(__name__) + class CategoricalDataProcessor(BaseEstimator): """ @@ -58,12 +58,12 @@ class CategoricalDataProcessor(BaseEstimator): "category_size_threshold", "p_value_threshold", "scale_contingency_table", "forced_categories"] - def __init__(self, regroup: bool=True, regroup_name: str="Other", - keep_missing: bool=True, - category_size_threshold: int=5, - p_value_threshold: float=0.001, - scale_contingency_table: bool=True, - forced_categories: dict={}): + def __init__(self, regroup: bool = True, regroup_name: str = "Other", + keep_missing: bool = True, + category_size_threshold: int = 5, + p_value_threshold: float = 0.001, + scale_contingency_table: bool = True, + forced_categories: dict = {}): self.regroup = regroup self.regroup_name = regroup_name @@ -149,7 +149,8 @@ def fit(self, data: pd.DataFrame, column_names: list, log.info("regroup was set to False, so no fitting is required") return None - for column_name in column_names: + for column_name in tqdm(column_names, desc="Fitting category " + "regrouping..."): if column_name not in data.columns: log.warning("DataFrame has no column '{}', so it will be " @@ -182,6 +183,11 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, list list of categories to combine into a category "Other" """ + if len(data[column_name].unique()) == 1: + log.warning(f"Predictor {column_name} is constant" + " and will be ignored in computation.") + return set(data[column_name].unique()) + y = data[target_column] incidence = y.mean() @@ -305,7 +311,8 @@ def _transform_column(self, data: pd.DataFrame, data.loc[:, column_name_clean] = (CategoricalDataProcessor ._replace_categories( data[column_name_clean], - categories)) + categories, + self.regroup_name)) # change data to categorical data.loc[:, column_name_clean] = (data[column_name_clean] @@ -366,7 +373,7 @@ def _get_small_categories(predictor_series: pd.Series, @staticmethod def _replace_missings(data: pd.DataFrame, - column_names: Optional[list]=None) -> pd.DataFrame: + column_names: Optional[list] = None) -> pd.DataFrame: """Replace missing values (incl empty strings) Parameters @@ -398,23 +405,25 @@ def _replace_missings(data: pd.DataFrame, @staticmethod def _compute_p_value(X: pd.Series, y: pd.Series, category: str, scale_contingency_table: bool) -> float: - """Summary + """Calculates p-value in contingency table (chi-square test) in + order to evaluate whether category of interest is significantly + different from the rest of the categories, given the target variable. 
Parameters ---------- X : pd.Series - Description + Variables data. y : pd.Series - Description + Target data. category : str - Description + Category for which we carry out the test scale_contingency_table : bool - Description + Whether we scale contingency table with incidence rate Returns ------- float - Description + p-value of chi-square test """ df = pd.concat([X, y], axis=1) df["other_categories"] = np.where(X == category, 0, 1) @@ -434,20 +443,24 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str, return stats.chi2_contingency(contigency_table, correction=False)[1] @staticmethod - def _replace_categories(data: pd.Series, categories: set) -> pd.Series: + def _replace_categories(data: pd.Series, categories: set, + replace_with: str) -> pd.Series: """replace categories in set with "Other" and transform the remaining categories to strings to avoid type errors later on in the pipeline Parameters ---------- data : pd.Series - Description + Dataset which contains the variable to be replaced categories : set - Description + Cleaned categories. + replace_with: str + String to be used as replacement for category. Returns ------- pd.Series - Description + Series with replaced categories """ - return data.apply(lambda x: str(x) if x in categories else "Other") + return data.apply( + lambda x: str(x) if x in categories else replace_with) diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py index 8a802d9..0fbb29d 100644 --- a/cobra/preprocessing/kbins_discretizer.py +++ b/cobra/preprocessing/kbins_discretizer.py @@ -16,17 +16,17 @@ from copy import deepcopy from typing import List import numbers - import logging -log = logging.getLogger(__name__) +import math # third party imports import numpy as np import pandas as pd - +from tqdm.auto import tqdm from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError -#from sklearn.cluster import KMeans + +log = logging.getLogger(__name__) class KBinsDiscretizer(BaseEstimator): @@ -70,12 +70,12 @@ class KBinsDiscretizer(BaseEstimator): "starting_precision", "label_format", "change_endpoint_format"] - def __init__(self, n_bins: int=10, strategy: str="quantile", - closed: str="right", - auto_adapt_bins: bool=False, - starting_precision: int=0, - label_format: str="{} - {}", - change_endpoint_format: bool=False): + def __init__(self, n_bins: int = 10, strategy: str = "quantile", + closed: str = "right", + auto_adapt_bins: bool = False, + starting_precision: int = 0, + label_format: str = "{} - {}", + change_endpoint_format: bool = False): # validate number of bins self._validate_n_bins(n_bins) @@ -163,7 +163,7 @@ def set_attributes_from_dict(self, params: dict): self.set_params(**params) self._bins_by_column = { - key: ([tuple(l) for l in value] if value else None) + key: ([tuple(v) for v in value] if value else None) for key, value in _bins_by_column.items() } @@ -186,7 +186,8 @@ def fit(self, data: pd.DataFrame, column_names: list): .format(KBinsDiscretizer.__name__, self.valid_strategies, self.strategy)) - for column_name in column_names: + for column_name in tqdm(column_names, desc="Computing " + "discretization bins..."): if column_name not in data.columns: log.warning("DataFrame has no column '{}', so it will be " @@ -214,7 +215,6 @@ def _fit_column(self, data: pd.DataFrame, List[tuple] list of bins as tuples """ - col_min, col_max = data[column_name].min(), data[column_name].max() if col_min == col_max: @@ -222,6 +222,22 @@ def _fit_column(self, data: pd.DataFrame, 
"will be ignored in computation".format(column_name)) return None + prop_inf = (np.sum(np.isinf(data[column_name])) + / data[column_name].shape[0]) + + if prop_inf > 0: + log.warning(f"Column {column_name} has " + f"{prop_inf:.1%} inf values, thus it was skipped. " + f"Consider dropping or transforming it.") + return None + + prop_nan = data[column_name].isna().sum() / data[column_name].shape[0] + + if prop_nan >= 0.99: + log.warning(f"Column {column_name} is" + f" {prop_nan:.1%}% NaNs, " + f"consider dropping or transforming it.") + n_bins = self.n_bins if self.auto_adapt_bins: size = len(data.index) @@ -266,7 +282,7 @@ def transform(self, data: pd.DataFrame, raise NotFittedError(msg.format(self.__class__.__name__)) - for column_name in column_names: + for column_name in tqdm(column_names, desc="Discretizing columns..."): if column_name not in self._bins_by_column: log.warning("Column '{}' is not in fitted output " "and will be skipped".format(column_name)) @@ -402,8 +418,22 @@ def _compute_bin_edges(self, data: pd.DataFrame, column_name: str, # bin_edges = (centers[1:] + centers[:-1]) * 0.5 # bin_edges = np.r_[col_min, bin_edges, col_max] - # Make sure the bin_edges are unique and sorted - return sorted(list(set(bin_edges))) + # nans lead to unexpected behavior during sorting, + # by replacing with inf we ensure these stay at the + # outermost edges + if math.isnan(bin_edges[0]): + bin_edges[0] = -np.inf + + if math.isnan(bin_edges[-1]): + bin_edges[-1] = np.inf + + if np.isnan(bin_edges).sum() > 0: + log.warning(f"Column {column_name} " + "has NaNs present in bin definitions") + + # Make sure the bin_edges are unique + # and order remains the same + return list(dict.fromkeys(bin_edges)) def _compute_minimal_precision_of_bin_edges(self, bin_edges: list) -> int: """Compute the minimal precision of a list of bin_edges so that we end @@ -467,7 +497,7 @@ def _compute_bins_from_edges(self, bin_edges: list) -> List[tuple]: @staticmethod def _create_index(intervals: List[tuple], - closed: str="right") -> pd.IntervalIndex: + closed: str = "right") -> pd.IntervalIndex: """Create an pd.IntervalIndex based on a list of tuples. 
This is basically a wrapper around pd.IntervalIndex.from_tuples However, the lower bound of the first entry in the list (the lower bin) diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index 7061d2a..0177e34 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -14,9 +14,10 @@ import inspect from datetime import datetime import time - +import math import logging -log = logging.getLogger(__name__) +from random import shuffle + # third party imports import pandas as pd from sklearn.model_selection import train_test_split @@ -27,6 +28,8 @@ from cobra.preprocessing import TargetEncoder from cobra.preprocessing import CategoricalDataProcessor +log = logging.getLogger(__name__) + class PreProcessor(BaseEstimator): @@ -56,7 +59,7 @@ class PreProcessor(BaseEstimator): def __init__(self, categorical_data_processor: CategoricalDataProcessor, discretizer: KBinsDiscretizer, target_encoder: TargetEncoder, - is_fitted: bool=False): + is_fitted: bool = False): self._categorical_data_processor = categorical_data_processor self._discretizer = discretizer @@ -66,22 +69,22 @@ def __init__(self, categorical_data_processor: CategoricalDataProcessor, @classmethod def from_params(cls, - n_bins: int=10, - strategy: str="quantile", - closed: str="right", - auto_adapt_bins: bool=False, - starting_precision: int=0, - label_format: str="{} - {}", - change_endpoint_format: bool=False, - regroup: bool=True, - regroup_name: str="Other", - keep_missing: bool=True, - category_size_threshold: int=5, - p_value_threshold: float=0.001, - scale_contingency_table: bool=True, - forced_categories: dict={}, - weight: float=0.0, - imputation_strategy: str="mean"): + n_bins: int = 10, + strategy: str = "quantile", + closed: str = "right", + auto_adapt_bins: bool = False, + starting_precision: int = 0, + label_format: str = "{} - {}", + change_endpoint_format: bool = False, + regroup: bool = True, + regroup_name: str = "Other", + keep_missing: bool = True, + category_size_threshold: int = 5, + p_value_threshold: float = 0.001, + scale_contingency_table: bool = True, + forced_categories: dict = {}, + weight: float = 0.0, + imputation_strategy: str = "mean"): """Constructor to instantiate PreProcessor from all the parameters that can be set in all its required (attribute) classes along with good default values. @@ -224,6 +227,9 @@ def fit(self, train_data: pd.DataFrame, continuous_vars: list, log.info("Starting to fit pipeline") start = time.time() + # Ensure to operate on separate copy of data + train_data = train_data.copy() + # Fit discretizer, categorical preprocessor & target encoder # Note that in order to fit target_encoder, we first have to transform # the data using the fitted discretizer & categorical_data_processor @@ -337,103 +343,55 @@ def fit_transform(self, train_data: pd.DataFrame, continuous_vars: list, @staticmethod def train_selection_validation_split(data: pd.DataFrame, - target_column_name: str, - train_prop: float=0.6, - selection_prop: float=0.2, - validation_prop: float=0.2, - stratify_split=True)->pd.DataFrame: - """Split dataset into train-selection-validation datasets and merge - them into one big DataFrame with an additional column "split" - indicating to which dataset the corresponding row belongs to. + train_prop: float = 0.6, + selection_prop: float = 0.2, + validation_prop: float = 0.2)-> pd.DataFrame: + """Adds `split` column with train/selection/validation values + to the dataset. 
Parameters ---------- data : pd.DataFrame Input dataset to split into train-selection and validation sets - target_column_name : str - Name of the target column train_prop : float, optional Percentage data to put in train set selection_prop : float, optional Percentage data to put in selection set validation_prop : float, optional Percentage data to put in validation set - stratify_split : bool, optional - Whether or not to stratify the train-test split Returns ------- pd.DataFrame DataFrame with additional split column """ - - if train_prop + selection_prop + validation_prop != 1.0: + if not math.isclose(train_prop + selection_prop + validation_prop, 1.0): raise ValueError("The sum of train_prop, selection_prop and " - "validation_prop cannot differ from 1.0") + "validation_prop must be 1.0.") + + if train_prop == 0.0: + raise ValueError("train_prop cannot be zero!") if selection_prop == 0.0: raise ValueError("selection_prop cannot be zero!") - column_names = list(data.columns) - - predictors = [col for col in column_names if col != target_column_name] - - # for the first split, take sum of selection & validation pct as - # test pct - test_prop = selection_prop + validation_prop - # To further split our test set into selection + validation set, - # we have to modify validation pct because we only have test_prop of - # the data available anymore for further splitting! - validation_prop_modif = validation_prop / test_prop - - X = data[predictors] - y = data[target_column_name] - - stratify = None - if stratify_split: - stratify = y - - X_train, X_test, y_train, y_test = train_test_split( - X, y, - test_size=test_prop, - random_state=42, - stratify=stratify - ) - - df_train = pd.DataFrame(X_train, columns=predictors) - df_train[target_column_name] = y_train - df_train["split"] = "train" - - # If there is no validation percentage, return train-selection sets - # only - if validation_prop == 0.0: - df_selection = pd.DataFrame(X_test, columns=predictors) - df_selection[target_column_name] = y_test - df_selection["split"] = "selection" - - return (pd.concat([df_train, df_selection]) - .reset_index(drop=True)) - - if stratify_split: - stratify = y_test - - X_sel, X_val, y_sel, y_val = train_test_split( - X_test, y_test, - test_size=validation_prop_modif, - random_state=42, - stratify=stratify - ) - - df_selection = pd.DataFrame(X_sel, columns=predictors) - df_selection[target_column_name] = y_sel - df_selection["split"] = "selection" - - df_validation = pd.DataFrame(X_val, columns=predictors) - df_validation[target_column_name] = y_val - df_validation["split"] = "validation" - - return (pd.concat([df_train, df_selection, df_validation]) - .reset_index(drop=True)) + nrows = data.shape[0] + size_train = int(train_prop * nrows) + size_select = int(selection_prop * nrows) + size_valid = int(validation_prop * nrows) + correction = nrows - (size_train+size_select+size_valid) + + split = ['train'] * size_train \ + + ['train'] * correction \ + + ['selection'] * size_select \ + + ['validation'] * size_valid + + shuffle(split) + + data['split'] = split + + return data + def serialize_pipeline(self) -> dict: """Serialize the preprocessing pipeline by writing all its required diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index 7deaa27..0351049 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -8,14 +8,15 @@ - Matthias Roels (implementation) """ import logging -log = logging.getLogger(__name__) #import numpy as np 
import pandas as pd - +from tqdm.auto import tqdm from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError +log = logging.getLogger(__name__) + class TargetEncoder(BaseEstimator): @@ -144,7 +145,7 @@ def fit(self, data: pd.DataFrame, column_names: list, y = data[target_column] self._global_mean = y.sum() / y.count() - for column in column_names: + for column in tqdm(column_names, desc="Fitting target encoding..."): if column not in data.columns: log.warning("DataFrame has no column '{}', so it will be " "skipped in fitting" .format(column)) @@ -209,7 +210,7 @@ def transform(self, data: pd.DataFrame, raise NotFittedError(msg.format(self.__class__.__name__)) - for column in column_names: + for column in tqdm(column_names, desc="Applying target encoding..."): if column not in data.columns: log.warning("Unknown column '{}' will be skipped" diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index 180ea19..d8d69b1 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -44,7 +44,6 @@ This will be taken care of by the ``PreProcessor`` class, which has a scikit-lea # containing each of those values basetable = preprocessor.train_selection_validation_split( basetable, - target_column_name=target_column_name, train_prop=0.6, selection_prop=0.2, validation_prop=0.2) @@ -222,4 +221,4 @@ Additionally, we can also compute the output needed to plot the so-called Predic target_column_name=target_column_name, preprocessed_predictors=predictor_list) # Plot PIGs - plot_incidence(pig_tables, 'predictor_name', predictor_order) \ No newline at end of file + plot_incidence(pig_tables, 'predictor_name', predictor_order) \ No newline at end of file diff --git a/junit/test-results.xml b/junit/test-results.xml new file mode 100644 index 0000000..b02eb35 --- /dev/null +++ b/junit/test-results.xml @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/material/logo.png b/material/logo.png new file mode 100644 index 0000000..f940ecd Binary files /dev/null and b/material/logo.png differ diff --git a/requirements.txt b/requirements.txt index 8ee7c41..9f3d508 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ scipy>=1.5.4 scikit-learn>=0.23.1 matplotlib>=3.3.3 seaborn>=0.11.0 +tqdm>=4.59.0 \ No newline at end of file diff --git a/tests/evaluation/test_evaluation.py b/tests/evaluation/test_evaluation.py index 0ca5861..16273f2 100644 --- a/tests/evaluation/test_evaluation.py +++ b/tests/evaluation/test_evaluation.py @@ -1,6 +1,8 @@ import pytest import pandas as pd -from cobra.evaluation import plotIncidence +import numpy as np +from cobra.evaluation import plot_incidence +from cobra.evaluation import Evaluator def mock_data(): @@ -11,6 +13,13 @@ def mock_data(): 'incidence': [0.047, 0.0434, 0.054, 0.069]} return pd.DataFrame(d) +def mock_preds(n, seed = 505): + np.random.seed(seed) + + y_true = np.random.uniform(size=n) + y_pred = np.random.uniform(size=n) + + return y_true, y_pred class TestEvaluation: @@ -18,4 +27,18 @@ def test_plot_incidence(self): data = mock_data() column_order = ['1st-4th', '5th-6th', '7th-8th'] with pytest.raises(Exception): - plotIncidence(data, 'education', column_order) + plot_incidence(data, 'education', column_order) + + def test_lift_curve_n_bins(self): + n_bins_test = [5, 10, 15, 35] + + y_true, y_pred = mock_preds(50) + + n_bins_out = [] + for n_bins in n_bins_test: + e = Evaluator(n_bins = n_bins) + out = Evaluator._compute_lift_per_bin(y_true, y_pred, e.n_bins) + lifts = out[1] + n_bins_out.append(len(lifts)) + 
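+        # each requested number of bins should yield exactly that many lift values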
+ assert n_bins_test == n_bins_out \ No newline at end of file diff --git a/tests/preprocessing/test_categorical_data_processor.py b/tests/preprocessing/test_categorical_data_processor.py index 95ebc56..5b4ec3f 100644 --- a/tests/preprocessing/test_categorical_data_processor.py +++ b/tests/preprocessing/test_categorical_data_processor.py @@ -116,6 +116,182 @@ def test_replace_categories(self, cleaned_categories, expected): data = pd.Series(data=["c1", "c2", "c3", "c4"]) actual = (CategoricalDataProcessor - ._replace_categories(data, cleaned_categories)) + ._replace_categories(data, cleaned_categories, 'Other')) pd.testing.assert_series_equal(actual, expected) + + def test_all_cats_not_significant(self): + # Expected + e = {'categorical_var': ['A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', + 'C', 'C', 'C', 'C'], + 'target': [1, 1, 1, 1, + 0, 0, 0, 0, + 1, 0, 1, 0], + 'categorical_var_processed': ['A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', + 'C', 'C', 'C', 'C']} + + # data -> actual + d = {'categorical_var': ['A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', + 'C', 'C', 'C', 'C'], + 'target': [1, 1, 1, 1, + 0, 0, 0, 0, + 1, 0, 1, 0]} + + discrete_vars = ['categorical_var'] + target_column_name = 'target' + + data = pd.DataFrame(d, columns=['categorical_var', 'target']) + expected = pd.DataFrame(e, columns=['categorical_var', + 'target', + 'categorical_var_processed']) + + categorical_data_processor = CategoricalDataProcessor( + category_size_threshold=0, + p_value_threshold=0.0001) + + categorical_data_processor.fit(data, + discrete_vars, + target_column_name) + + actual = categorical_data_processor.transform(data, + discrete_vars) + + pd.testing.assert_frame_equal(actual, expected) + + def test_regroup_name(self): + # Expected + e = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', 'B', 'B', + 'C', 'C', 'C', 'C', 'C', 'C'], + 'target': [1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, + 1, 0, 1, 0, 1, 0], + 'categorical_var_processed': [ + 'A', 'A', 'A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', 'B', 'B', + 'OTH', 'OTH', 'OTH', 'OTH', 'OTH', 'OTH']} + + # data -> actual + d = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', 'B', 'B', + 'C', 'C', 'C', 'C', 'C', 'C'], + 'target': [1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, + 1, 0, 1, 0, 1, 0]} + + discrete_vars = ['categorical_var'] + target_column_name = 'target' + + data = pd.DataFrame(d, columns=['categorical_var', 'target']) + expected = pd.DataFrame(e, columns=['categorical_var', + 'target', + 'categorical_var_processed']) + + expected['categorical_var_processed'] = ( + expected['categorical_var_processed'].astype("category")) + + categorical_data_processor = CategoricalDataProcessor( + category_size_threshold=0, + regroup_name='OTH', + p_value_threshold=0.05) + + categorical_data_processor.fit(data, + discrete_vars, + target_column_name) + + actual = categorical_data_processor.transform(data, + discrete_vars) + + pd.testing.assert_frame_equal(actual, expected) + + def test_force_category(self): + # Expected + e = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', 'B', 'B', + 'C', 'C', 'C', 'C', 'C', 'C'], + 'target': [1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, + 1, 0, 1, 0, 1, 0], + 'categorical_var_processed': ['A', 'A', 'A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', 'B', 'B', + 'C', 'C', 'C', 'C', 'C', 'C']} + + # data -> actual + d = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', 'B', 'B', + 'C', 'C', 'C', 'C', 'C', 'C'], + 'target': [1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, + 1, 0, 1, 
0, 1, 0]} + + discrete_vars = ['categorical_var'] + target_column_name = 'target' + + data = pd.DataFrame(d, columns=['categorical_var', 'target']) + expected = pd.DataFrame(e, columns=['categorical_var', + 'target', + 'categorical_var_processed']) + + expected['categorical_var_processed'] = ( + expected['categorical_var_processed'].astype("category")) + + categorical_data_processor = CategoricalDataProcessor( + category_size_threshold=0, + forced_categories={'categorical_var': ['C']}, + p_value_threshold=0.05) + + categorical_data_processor.fit(data, + discrete_vars, + target_column_name) + + actual = categorical_data_processor.transform(data, + discrete_vars) + + pd.testing.assert_frame_equal(actual, expected) + + def test_categorical_variable_is_constant(self): + # Expected + e = {'categorical_var': ['A', 'A', 'A', 'A', + 'A', 'A', 'A', 'A', + 'A', 'A', 'A', 'A'], + 'target': [1, 1, 1, 1, + 0, 0, 0, 0, + 1, 0, 1, 0], + 'categorical_var_processed': ['A', 'A', 'A', 'A', + 'A', 'A', 'A', 'A', + 'A', 'A', 'A', 'A']} + + # data -> actual + d = {'categorical_var': ['A', 'A', 'A', 'A', + 'A', 'A', 'A', 'A', + 'A', 'A', 'A', 'A'], + 'target': [1, 1, 1, 1, + 0, 0, 0, 0, + 1, 0, 1, 0]} + + discrete_vars = ['categorical_var'] + target_column_name = 'target' + + data = pd.DataFrame(d, columns=['categorical_var', 'target']) + expected = pd.DataFrame(e, columns=['categorical_var', + 'target', + 'categorical_var_processed']) + + expected['categorical_var_processed'] = ( + expected['categorical_var_processed'].astype("category")) + + categorical_data_processor = CategoricalDataProcessor( + category_size_threshold=0, + p_value_threshold=0.0001) + + categorical_data_processor.fit(data, + discrete_vars, + target_column_name) + + actual = categorical_data_processor.transform(data, + discrete_vars) + + pd.testing.assert_frame_equal(actual, expected) diff --git a/tests/preprocessing/test_kbins_discretizer.py b/tests/preprocessing/test_kbins_discretizer.py index ced0ddc..5b0aeeb 100644 --- a/tests/preprocessing/test_kbins_discretizer.py +++ b/tests/preprocessing/test_kbins_discretizer.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd +import math from cobra.preprocessing.kbins_discretizer import KBinsDiscretizer @@ -14,7 +15,7 @@ def does_not_raise(): class TestKBinsDiscretizer: - ################# Test for public methods ################# + # ---------------- Test for public methods ---------------- def test_attributes_to_dict(self): discretizer = KBinsDiscretizer() @@ -118,7 +119,7 @@ def test_transform(self, scenario, expectation): actual = discretizer.transform(data, ["variable"]) pd.testing.assert_frame_equal(actual, expected) - ################# Test for private methods ################# + # ---------------- Test for private methods ---------------- @pytest.mark.parametrize("n_bins, expectation", [(1, pytest.raises(ValueError)), (10.5, pytest.raises(ValueError)), @@ -163,9 +164,12 @@ def test_transform_column(self): (10, False, # almost constant pd.DataFrame({"variable": [0] + ([1] * 100)}), + None), + (2, False, + pd.DataFrame({"variable": [5.4, 9.3, np.inf]}), None)], ids=["regular", "auto_adapt_bins", - "two bin edges"]) + "two bin edges", "infs"]) def test_fit_column(self, n_bins, auto_adapt_bins, data, expected): discretizer = KBinsDiscretizer(n_bins=n_bins, auto_adapt_bins=auto_adapt_bins) @@ -218,7 +222,9 @@ def test_compute_minimal_precision_of_bin_edges(self, bin_edges, @pytest.mark.parametrize("bin_edges, expected", [([0, 1, 1.5, 2], [(0, 1), (1, 1.5), (1.5, 2)]), - ([0, 1, 1.5, 3], 
[(0, 1), (1, 2), (2, 3)])]) + ([0, 1, 1.5, 3], [(0, 1), (1, 2), (2, 3)]), + ([np.inf, 0.0, -np.inf], + [(np.inf, 0.0), (0.0, -np.inf)])]) def test_compute_bins_from_edges(self, bin_edges, expected): discretizer = KBinsDiscretizer() diff --git a/tests/preprocessing/test_preprocessor.py b/tests/preprocessing/test_preprocessor.py index f529252..80f6d73 100644 --- a/tests/preprocessing/test_preprocessor.py +++ b/tests/preprocessing/test_preprocessor.py @@ -16,13 +16,19 @@ def does_not_raise(): class TestPreProcessor: - @pytest.mark.parametrize(("train_prop, selection_prop, " - "validation_prop, expected_sizes"), + @pytest.mark.parametrize("train_prop, selection_prop, validation_prop, " + "expected_sizes", [(0.6, 0.2, 0.2, {"train": 6, "selection": 2, "validation": 2}), (0.7, 0.3, 0.0, {"train": 7, - "selection": 3})]) + "selection": 3}), + # Error "The sum of train_prop, selection_prop and + # validation_prop must be 1.0." should not be + # raised: + (0.7, 0.2, 0.1, {"train": 7, + "selection": 2, + "validation": 1})]) def test_train_selection_validation_split(self, train_prop: float, selection_prop: float, validation_prop: float, @@ -31,19 +37,13 @@ def test_train_selection_validation_split(self, train_prop: float, data = pd.DataFrame(X, columns=[f"c{i+1}" for i in range(10)]) data.loc[:, "target"] = np.array([0] * 7 + [1] * 3) - # No stratified split here because sample size is to low to make - # it work. This feature is already well-tested in scikit-learn and - # needs no further testing here actual = PreProcessor.train_selection_validation_split(data, - "target", train_prop, selection_prop, - validation_prop, - False) + validation_prop) # check for the output schema - expected_schema = list(data.columns) + ["split"] - assert list(actual.columns) == expected_schema + assert list(actual.columns) == list(data.columns) # check that total size of input & output is the same! assert len(actual.index) == len(data.index) @@ -56,7 +56,7 @@ def test_train_selection_validation_split(self, train_prop: float, def test_train_selection_validation_split_error_wrong_prop(self): error_msg = ("The sum of train_prop, selection_prop and " - "validation_prop cannot differ from 1.0") + "validation_prop must be 1.0.") train_prop = 0.7 selection_prop = 0.3 @@ -79,10 +79,9 @@ def _test_train_selection_validation_split_error(self, selection_prop: float, error_msg: str): df = pd.DataFrame() - cname = "" with pytest.raises(ValueError, match=error_msg): (PreProcessor - .train_selection_validation_split(df, cname, + .train_selection_validation_split(df, train_prop=train_prop, selection_prop=selection_prop, validation_prop=0.1))
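As a closing illustration, the ``n_bins`` parameter added to ``Evaluator`` flows through ``fit`` into the lift-curve computation, which ``test_lift_curve_n_bins`` above exercises on the private method. A minimal sketch through the public interface, assuming a binary target and probability-like scores (random data stands in for real model output): ::

    import numpy as np
    from cobra.evaluation import Evaluator

    rng = np.random.default_rng(505)
    y_true = (rng.uniform(size=500) > 0.8).astype(int)  # hypothetical binary target
    y_pred = rng.uniform(size=500)                       # hypothetical model scores

    evaluator = Evaluator(lift_at=0.05, n_bins=20)  # 20 bins instead of the default deciles
    evaluator.fit(y_true, y_pred)

    lifts = evaluator.lift_curve[1]  # element 1 holds the lifts, as in the test above
    print(len(lifts))                # 20 -> one lift value per requested bin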