-
Notifications
You must be signed in to change notification settings - Fork 6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
issue 137/PR 145 automatically search for categorical variables #145
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -223,28 +223,87 @@ def from_pipeline(cls, pipeline: dict): | |
target_encoder, | ||
is_fitted=pipeline["_is_fitted"], | ||
) | ||
|
||
def get_continous_and_discreate_columns( | ||
self, | ||
df : pd.DataFrame, | ||
id_col_name : str, | ||
target_column_name :str | ||
) -> tuple: | ||
"""Filters out the continious and discreate varaibles out of a dataframe and returns a tuple containing lists of column names | ||
It assumes that numerical comumns with less than or equal to 10 different values are categorical | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. continuous instead of continious |
||
|
||
Parameters | ||
---------- | ||
df : pd.DataFrame | ||
DataFrame that you want to divide in discreate and continous variables | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. typos |
||
id_col_name : str | ||
column name of the id column, can be None | ||
target_column_name : str | ||
column name of the target column | ||
|
||
Returns | ||
------- | ||
tuple | ||
tuple containing 2 lists of column names. (continuous_vars, discrete_vars) | ||
""" | ||
if id_col_name == None: | ||
log.warning("id_col_name is equal to None. If there is no id column ignore this warning") | ||
|
||
# find continuous_vars and discrete_vars in the dateframe | ||
col_dtypes = df.dtypes | ||
discrete_vars = [col for col in col_dtypes[col_dtypes==object].index.tolist() if col not in [id_col_name, target_column_name]] | ||
|
||
|
||
for col in df.columns: | ||
if col not in discrete_vars and col not in [id_col_name, target_column_name]: # omit discrete because a string, and target | ||
val_counts = df[col].nunique() | ||
if val_counts > 1 and val_counts <= 10: # the column contains less than 10 different values | ||
discrete_vars.append(col) | ||
|
||
continuous_vars = list(set(df.columns) | ||
- set(discrete_vars) | ||
- set([id_col_name, target_column_name])) | ||
log.warning( | ||
f"""Cobra automaticaly assumes that following variables are | ||
discrete: {discrete_vars} | ||
continuous: {continuous_vars} | ||
If you want to change this behaviour you can specify the discrete/continuous variables yourself with the continuous_vars and discrete_vars keywords. \nIt assumes that numerical comumns with less than or equal to 10 different values are categorical""" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. typo comumns -> columns |
||
) | ||
return continuous_vars, discrete_vars | ||
|
||
def fit( | ||
self, | ||
train_data: pd.DataFrame, | ||
continuous_vars: list, | ||
discrete_vars: list, | ||
target_column_name: str, | ||
id_col_name: str = None | ||
): | ||
"""Fit the data to the preprocessing pipeline. | ||
If you put continious_vars and target_vars equal to `None` and give the id_col_name Cobra will guess which varaibles are continious and which are not | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. typos: |
||
|
||
Parameters | ||
---------- | ||
train_data : pd.DataFrame | ||
Data to be preprocessed. | ||
continuous_vars : list | ||
List of continuous variables. | ||
discrete_vars : list | ||
List of discrete variables. | ||
continuous_vars : list | None | ||
List of continuous variables, can be None. | ||
discrete_vars : list | None | ||
List of discrete variables, can be None. | ||
target_column_name : str | ||
Column name of the target. | ||
id_col_name : str, optional | ||
_description_, by default None | ||
""" | ||
if not (continuous_vars and discrete_vars): | ||
continuous_vars, discrete_vars = self.get_continous_and_discreate_columns( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Typos in code can happen, but please always double-check your method names for typos, since this snowballs to the end user eventually. |
||
df=train_data, | ||
id_col_name=id_col_name, | ||
target_column_name=target_column_name | ||
|
||
) | ||
|
||
# get list of all variables | ||
preprocessed_variable_names = PreProcessor._get_variable_list( | ||
continuous_vars, discrete_vars | ||
|
@@ -359,27 +418,38 @@ def fit_transform( | |
continuous_vars: list, | ||
discrete_vars: list, | ||
target_column_name: str, | ||
id_col_name: str = None | ||
) -> pd.DataFrame: | ||
|
||
"""Fit preprocessing pipeline and transform the data. | ||
If you put continious_vars and target_vars equal to `None` and give the id_col_name Cobra will guess which varaibles are continious and which are not | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. typos |
||
|
||
Parameters | ||
---------- | ||
train_data : pd.DataFrame | ||
Data to be preprocessed | ||
continuous_vars : list | ||
List of continuous variables. | ||
List of continuous variables, can be None. | ||
discrete_vars : list | ||
List of discrete variables. | ||
List of discrete variables, can be None. | ||
target_column_name : str | ||
Column name of the target. | ||
id_col_name : str, optional | ||
_description_, by default None | ||
|
||
Returns | ||
------- | ||
pd.DataFrame | ||
Transformed (preprocessed) data. | ||
""" | ||
if not (continuous_vars and discrete_vars) and id_col_name: | ||
continuous_vars, discrete_vars = self.get_continous_and_discreate_columns( | ||
df=train_data, | ||
id_col_name=id_col_name, | ||
target_column_name=target_column_name | ||
|
||
self.fit(train_data, continuous_vars, discrete_vars, target_column_name) | ||
) | ||
self.fit(train_data, continuous_vars, discrete_vars, target_column_name, id_col_name) | ||
|
||
return self.transform(train_data, continuous_vars, discrete_vars) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
numpy>=1.19.4 | ||
pandas>=1.1.5 | ||
scipy>=1.5.4 | ||
scikit-learn>=0.24.1 | ||
scikit-learn>=1.2.0 | ||
matplotlib>=3.4.3 | ||
seaborn>=0.11.0 | ||
tqdm>=4.62.2 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
get_continuous_and_discrete_columns(