issue 137/PR 145 automatically search for categorical variables #145

Merged
84 changes: 77 additions & 7 deletions cobra/preprocessing/preprocessor.py
@@ -223,28 +223,87 @@ def from_pipeline(cls, pipeline: dict):
target_encoder,
is_fitted=pipeline["_is_fitted"],
)

def get_continous_and_discreate_columns(
Contributor comment: get_continuous_and_discrete_columns(...)

self,
df : pd.DataFrame,
id_col_name : str,
target_column_name :str
) -> tuple:
"""Filters out the continious and discreate varaibles out of a dataframe and returns a tuple containing lists of column names
It assumes that numerical comumns with less than or equal to 10 different values are categorical
Contributor comment:
continuous instead of continious
discrete instead of discreate
variables instead of varaibles
columns instead of comumns


Parameters
----------
df : pd.DataFrame
DataFrame that you want to divide in discreate and continous variables
Contributor comment: typos

id_col_name : str
column name of the id column, can be None
target_column_name : str
column name of the target column

Returns
-------
tuple
tuple containing 2 lists of column names. (continuous_vars, discrete_vars)
"""
if id_col_name == None:
log.warning("id_col_name is equal to None. If there is no id column ignore this warning")

# find continuous_vars and discrete_vars in the dateframe
col_dtypes = df.dtypes
discrete_vars = [col for col in col_dtypes[col_dtypes==object].index.tolist() if col not in [id_col_name, target_column_name]]


for col in df.columns:
if col not in discrete_vars and col not in [id_col_name, target_column_name]: # omit discrete because a string, and target
val_counts = df[col].nunique()
if val_counts > 1 and val_counts <= 10: # the column contains less than 10 different values
discrete_vars.append(col)

continuous_vars = list(set(df.columns)
- set(discrete_vars)
- set([id_col_name, target_column_name]))
log.warning(
f"""Cobra automaticaly assumes that following variables are
discrete: {discrete_vars}
continuous: {continuous_vars}
If you want to change this behaviour you can specify the discrete/continuous variables yourself with the continuous_vars and discrete_vars keywords. \nIt assumes that numerical comumns with less than or equal to 10 different values are categorical"""
Contributor comment: typo comumns -> columns

)
return continuous_vars, discrete_vars
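
For illustration, a minimal sketch (not part of this diff) of how the new helper's heuristic classifies columns: object-typed columns, and numeric columns with more than one but at most 10 distinct values, are treated as discrete, everything else as continuous. The import path and the from_params(model_type="classification") call follow the tests added below; the toy column names are invented.

```python
import pandas as pd

from cobra.preprocessing import PreProcessor  # assumed import path

# Toy frame: "B" is object-typed, "A" has 3 distinct values, "C" has 20.
df = pd.DataFrame({
    "ID": list(range(20)),
    "A": [1, 2, 3] * 6 + [1, 2],   # numeric, <= 10 distinct values -> discrete
    "B": ["x", "y"] * 10,          # object dtype -> discrete
    "C": list(range(20)),          # 20 distinct values -> continuous
    "Target": [0, 1] * 10,
})

preprocessor = PreProcessor.from_params(model_type="classification")
# method name as it appears in this diff (see the review comments about the typo)
continuous_vars, discrete_vars = preprocessor.get_continous_and_discreate_columns(
    df=df, id_col_name="ID", target_column_name="Target"
)
# continuous_vars -> ["C"]; discrete_vars -> ["B", "A"]; ID and Target are excluded
```

Because the heuristic also requires more than one distinct value, a constant numeric column would end up in continuous_vars rather than discrete_vars.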

def fit(
self,
train_data: pd.DataFrame,
continuous_vars: list,
discrete_vars: list,
target_column_name: str,
id_col_name: str = None
):
"""Fit the data to the preprocessing pipeline.
If you put continious_vars and target_vars equal to `None` and give the id_col_name Cobra will guess which varaibles are continious and which are not
Contributor comment: typos: continuous, variables, continuous


Parameters
----------
train_data : pd.DataFrame
Data to be preprocessed.
continuous_vars : list
List of continuous variables.
discrete_vars : list
List of discrete variables.
continuous_vars : list | None
List of continuous variables, can be None.
discrete_vars : list | None
List of discrete variables, can be None.
target_column_name : str
Column name of the target.
id_col_name : str, optional
_description_, by default None
"""
if not (continuous_vars and discrete_vars):
continuous_vars, discrete_vars = self.get_continous_and_discreate_columns(
Contributor comment: Typos in code can happen, but please always double-check your method names for typos, since this snowballs to the end user eventually. Suggested: get_continuous_and_discrete_columns(...)

df=train_data,
id_col_name=id_col_name,
target_column_name=target_column_name

)

# get list of all variables
preprocessed_variable_names = PreProcessor._get_variable_list(
continuous_vars, discrete_vars
@@ -359,27 +418,38 @@ def fit_transform(
continuous_vars: list,
discrete_vars: list,
target_column_name: str,
id_col_name: str = None
) -> pd.DataFrame:

"""Fit preprocessing pipeline and transform the data.
If you put continious_vars and target_vars equal to `None` and give the id_col_name Cobra will guess which varaibles are continious and which are not
Contributor comment: typos


Parameters
----------
train_data : pd.DataFrame
Data to be preprocessed
continuous_vars : list
List of continuous variables.
List of continuous variables, can be None.
discrete_vars : list
List of discrete variables.
List of discrete variables, can be None.
target_column_name : str
Column name of the target.
id_col_name : str, optional
_description_, by default None

Returns
-------
pd.DataFrame
Transformed (preprocessed) data.
"""
if not (continuous_vars and discrete_vars) and id_col_name:
continuous_vars, discrete_vars = self.get_continous_and_discreate_columns(
df=train_data,
id_col_name=id_col_name,
target_column_name=target_column_name

)
self.fit(train_data, continuous_vars, discrete_vars, target_column_name)
self.fit(train_data, continuous_vars, discrete_vars, target_column_name, id_col_name)

return self.transform(train_data, continuous_vars, discrete_vars)
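
A hedged end-to-end sketch of the code path this hunk enables, mirroring test_fit_transform_with_id_col_name below: leaving both variable lists as None and supplying id_col_name makes fit_transform (and fit, which performs the same detection) infer the variable types before building the basetable. The import path and the toy data are assumptions, not part of the diff.

```python
import pandas as pd

from cobra.preprocessing import PreProcessor  # assumed import path

train_data = pd.DataFrame({
    "ID": list(range(20)),
    "A": [1, 2, 3] * 6 + [1, 2],
    "B": ["x", "y"] * 10,
    "C": list(range(20)),
    "Target": [0, 1] * 10,
})

preprocessor = PreProcessor.from_params(model_type="classification")

# With both variable lists left as None and an id column given, the variable
# types are detected automatically before the pipeline is fitted and applied.
basetable = preprocessor.fit_transform(
    train_data,
    continuous_vars=None,
    discrete_vars=None,
    target_column_name="Target",
    id_col_name="ID",
)
```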

2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,7 +1,7 @@
numpy>=1.19.4
pandas>=1.1.5
scipy>=1.5.4
scikit-learn>=0.24.1
scikit-learn>=1.2.0
matplotlib>=3.4.3
seaborn>=0.11.0
tqdm>=4.62.2
2 changes: 0 additions & 2 deletions tests/model_building/test_models.py
@@ -225,7 +225,6 @@ def test_serialize(self):
"copy_X": True,
"fit_intercept": True,
"n_jobs": None,
"normalize": "deprecated",
"positive": False
}
}
@@ -244,7 +243,6 @@ def test_deserialize(self):
"copy_X": True,
"fit_intercept": True,
"n_jobs": None,
"normalize": "deprecated",
"positive": False
},
"coef_": [[0.5, 0.75]],
89 changes: 89 additions & 0 deletions tests/preprocessing/test_preprocessor.py
@@ -160,6 +160,95 @@ def test_get_variable_list(

assert actual == expected

@pytest.mark.parametrize(
("input, expected"),
[
# example 1
(
pd.DataFrame({
"ID": list(range(20)),
"A": [1,2,3,4,5,6,7,8,9,9,8,9,8,9,6,5,6,6,9,8],
"B": ["Cat"] *5 + ["Dog"]*10 + ["Fish"]*5,
"C": [1,2,3,4,9,10,11,12,13,5,6,7,8,15,19,18,14,16,13,17],
"Target": [1]*2 + [0]*5 + [1]*3 + [0]*5 + [1]*5
}
),
pd.DataFrame({
'ID': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
'B': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'],
'C': [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17],
'Target': [1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
'C_bin': ['1.0 - 3.0','1.0 - 3.0','1.0 - 3.0','3.0 - 5.0','7.0 - 9.0','9.0 - 10.0','10.0 - 12.0','10.0 - 12.0','12.0 - 13.0','3.0 - 5.0','5.0 - 7.0','5.0 - 7.0','7.0 - 9.0','13.0 - 15.0','17.0 - 19.0','17.0 - 19.0','13.0 - 15.0','15.0 - 17.0','12.0 - 13.0','15.0 - 17.0'],
'B_processed': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'],
'A_processed': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
'B_enc': [0.4,0.4,0.4,0.4,0.4,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,1.0,1.0,1.0,1.0,1.0],
'A_enc': [1.0,1.0,0.0,0.0,0.5,0.5,0.0,0.5,0.6,0.6,0.5,0.6,0.5,0.6,0.5,0.5,0.5,0.5,0.6,0.5],
'C_enc': [0.6666666666666666,0.6666666666666666,0.6666666666666666,0.5,0.0,0.0,0.5,0.5,1.0,0.5,0.0,0.0,0.0,0.5,0.5,0.5,0.5,1.0,1.0,1.0]
}
),
)
]
)
def test_fit_transform_without_id_col_name(self, input, expected):

preprocessor = PreProcessor.from_params(model_type="classification")

continuous_vars, discrete_vars = preprocessor.get_continous_and_discreate_columns(input, "ID","Target")

calculated = preprocessor.fit_transform(
input,
continuous_vars=continuous_vars,
discrete_vars=discrete_vars,
target_column_name="Target"
)
pd.testing.assert_frame_equal(calculated, expected, check_dtype=False, check_categorical=False)

@pytest.mark.parametrize(
("input, expected"),
[
# example 1
(
pd.DataFrame({
"ID": list(range(20)),
"A": [1,2,3,4,5,6,7,8,9,9,8,9,8,9,6,5,6,6,9,8],
"B": ["Cat"] *5 + ["Dog"]*10 + ["Fish"]*5,
"C": [1,2,3,4,9,10,11,12,13,5,6,7,8,15,19,18,14,16,13,17],
"Target": [1]*2 + [0]*5 + [1]*3 + [0]*5 + [1]*5
}
),
pd.DataFrame({
'ID': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
'B': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'],
'C': [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17],
'Target': [1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
'C_bin': ['1.0 - 3.0','1.0 - 3.0','1.0 - 3.0','3.0 - 5.0','7.0 - 9.0','9.0 - 10.0','10.0 - 12.0','10.0 - 12.0','12.0 - 13.0','3.0 - 5.0','5.0 - 7.0','5.0 - 7.0','7.0 - 9.0','13.0 - 15.0','17.0 - 19.0','17.0 - 19.0','13.0 - 15.0','15.0 - 17.0','12.0 - 13.0','15.0 - 17.0'],
'B_processed': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'],
'A_processed': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
'B_enc': [0.4,0.4,0.4,0.4,0.4,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,1.0,1.0,1.0,1.0,1.0],
'A_enc': [1.0,1.0,0.0,0.0,0.5,0.5,0.0,0.5,0.6,0.6,0.5,0.6,0.5,0.6,0.5,0.5,0.5,0.5,0.6,0.5],
'C_enc': [0.6666666666666666,0.6666666666666666,0.6666666666666666,0.5,0.0,0.0,0.5,0.5,1.0,0.5,0.0,0.0,0.0,0.5,0.5,0.5,0.5,1.0,1.0,1.0]
}
),
)
]
)
def test_fit_transform_with_id_col_name(self, input, expected):

preprocessor = PreProcessor.from_params(model_type="classification")

# continuous_vars, discrete_vars = preprocessor.get_continous_and_discreate_columns(input, "ID","Target")

calculated = preprocessor.fit_transform(
input,
continuous_vars=None,
discrete_vars=None,
target_column_name="Target",
id_col_name="ID"
)
pd.testing.assert_frame_equal(calculated, expected, check_dtype=False, check_categorical=False)

@staticmethod
def mock_transform(df: pd.DataFrame, args):
"""Mock the transform method."""