Skip to content

Commit

Permalink
Merge pull request #145 from PythonPredictions/137-automatically-sear…
Browse files Browse the repository at this point in the history
…ch-for-categorical-variables

137 automatically search for categorical variables
  • Loading branch information
patrickleonardy authored Jan 16, 2023
2 parents 8f770c7 + 2d610d7 commit 1163861
Show file tree
Hide file tree
Showing 4 changed files with 167 additions and 10 deletions.
84 changes: 77 additions & 7 deletions cobra/preprocessing/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,28 +223,87 @@ def from_pipeline(cls, pipeline: dict):
target_encoder,
is_fitted=pipeline["_is_fitted"],
)

def get_continous_and_discreate_columns(
    self,
    df: pd.DataFrame,
    id_col_name: str,
    target_column_name: str,
) -> tuple:
    """Split the columns of a DataFrame into continuous and discrete variables.

    A column is treated as discrete when its dtype is ``object`` (strings)
    or when it is numerical with more than 1 and at most 10 distinct
    values; every remaining column, except the id and target columns,
    is treated as continuous.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to divide into discrete and continuous variables.
    id_col_name : str
        Column name of the id column, can be None.
    target_column_name : str
        Column name of the target column.

    Returns
    -------
    tuple
        Tuple of two lists of column names: (continuous_vars, discrete_vars).
    """
    # "is None" rather than "== None" (PEP 8); None is a singleton.
    if id_col_name is None:
        log.warning("id_col_name is equal to None. If there is no id column ignore this warning")

    # Columns with dtype object (strings) are discrete by definition;
    # the id and target columns are never candidate variables.
    col_dtypes = df.dtypes
    discrete_vars = [
        col
        for col in col_dtypes[col_dtypes == object].index.tolist()
        if col not in [id_col_name, target_column_name]
    ]

    # Numerical columns with between 2 and 10 distinct values (inclusive)
    # are assumed to be categorical and added to the discrete variables.
    for col in df.columns:
        if col not in discrete_vars and col not in [id_col_name, target_column_name]:
            val_counts = df[col].nunique()
            if 1 < val_counts <= 10:
                discrete_vars.append(col)

    # Everything that is neither discrete nor the id/target column is continuous.
    continuous_vars = list(
        set(df.columns)
        - set(discrete_vars)
        - set([id_col_name, target_column_name])
    )
    log.warning(
        f"""Cobra automaticaly assumes that following variables are
discrete: {discrete_vars}
continuous: {continuous_vars}
If you want to change this behaviour you can specify the discrete/continuous variables yourself with the continuous_vars and discrete_vars keywords. \nIt assumes that numerical comumns with less than or equal to 10 different values are categorical"""
    )
    return continuous_vars, discrete_vars

def fit(
self,
train_data: pd.DataFrame,
continuous_vars: list,
discrete_vars: list,
target_column_name: str,
id_col_name: str = None
):
"""Fit the data to the preprocessing pipeline.
If you put continious_vars and target_vars equal to `None` and give the id_col_name Cobra will guess which varaibles are continious and which are not
Parameters
----------
train_data : pd.DataFrame
Data to be preprocessed.
continuous_vars : list
List of continuous variables.
discrete_vars : list
List of discrete variables.
continuous_vars : list | None
List of continuous variables, can be None.
discrete_vars : list | None
List of discrete variables, can be None.
target_column_name : str
Column name of the target.
id_col_name : str, optional
_description_, by default None
"""
if not (continuous_vars and discrete_vars):
continuous_vars, discrete_vars = self.get_continous_and_discreate_columns(
df=train_data,
id_col_name=id_col_name,
target_column_name=target_column_name

)

# get list of all variables
preprocessed_variable_names = PreProcessor._get_variable_list(
continuous_vars, discrete_vars
Expand Down Expand Up @@ -359,27 +418,38 @@ def fit_transform(
continuous_vars: list,
discrete_vars: list,
target_column_name: str,
id_col_name: str = None
) -> pd.DataFrame:

"""Fit preprocessing pipeline and transform the data.
If you put continious_vars and target_vars equal to `None` and give the id_col_name Cobra will guess which varaibles are continious and which are not
Parameters
----------
train_data : pd.DataFrame
Data to be preprocessed
continuous_vars : list
List of continuous variables.
List of continuous variables, can be None.
discrete_vars : list
List of discrete variables.
List of discrete variables, can be None.
target_column_name : str
Column name of the target.
id_col_name : str, optional
_description_, by default None
Returns
-------
pd.DataFrame
Transformed (preprocessed) data.
"""
if not (continuous_vars and discrete_vars) and id_col_name:
continuous_vars, discrete_vars = self.get_continous_and_discreate_columns(
df=train_data,
id_col_name=id_col_name,
target_column_name=target_column_name

self.fit(train_data, continuous_vars, discrete_vars, target_column_name)
)
self.fit(train_data, continuous_vars, discrete_vars, target_column_name, id_col_name)

return self.transform(train_data, continuous_vars, discrete_vars)

Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
numpy>=1.19.4
pandas>=1.1.5
scipy>=1.5.4
scikit-learn>=0.24.1
scikit-learn>=1.2.0
matplotlib>=3.4.3
seaborn>=0.11.0
tqdm>=4.62.2
2 changes: 0 additions & 2 deletions tests/model_building/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,6 @@ def test_serialize(self):
"copy_X": True,
"fit_intercept": True,
"n_jobs": None,
"normalize": "deprecated",
"positive": False
}
}
Expand All @@ -244,7 +243,6 @@ def test_deserialize(self):
"copy_X": True,
"fit_intercept": True,
"n_jobs": None,
"normalize": "deprecated",
"positive": False
},
"coef_": [[0.5, 0.75]],
Expand Down
89 changes: 89 additions & 0 deletions tests/preprocessing/test_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,95 @@ def test_get_variable_list(

assert actual == expected

@pytest.mark.parametrize(
    ("input_df, expected"),
    [
        # example 1
        (
            pd.DataFrame({
                "ID": list(range(20)),
                "A": [1,2,3,4,5,6,7,8,9,9,8,9,8,9,6,5,6,6,9,8],
                "B": ["Cat"] *5 + ["Dog"]*10 + ["Fish"]*5,
                "C": [1,2,3,4,9,10,11,12,13,5,6,7,8,15,19,18,14,16,13,17],
                "Target": [1]*2 + [0]*5 + [1]*3 + [0]*5 + [1]*5
            }
            ),
            pd.DataFrame({
                'ID': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
                'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
                'B': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'],
                'C': [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17],
                'Target': [1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
                'C_bin': ['1.0 - 3.0','1.0 - 3.0','1.0 - 3.0','3.0 - 5.0','7.0 - 9.0','9.0 - 10.0','10.0 - 12.0','10.0 - 12.0','12.0 - 13.0','3.0 - 5.0','5.0 - 7.0','5.0 - 7.0','7.0 - 9.0','13.0 - 15.0','17.0 - 19.0','17.0 - 19.0','13.0 - 15.0','15.0 - 17.0','12.0 - 13.0','15.0 - 17.0'],
                'B_processed': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'],
                'A_processed': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
                'B_enc': [0.4,0.4,0.4,0.4,0.4,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,1.0,1.0,1.0,1.0,1.0],
                'A_enc': [1.0,1.0,0.0,0.0,0.5,0.5,0.0,0.5,0.6,0.6,0.5,0.6,0.5,0.6,0.5,0.5,0.5,0.5,0.6,0.5],
                'C_enc': [0.6666666666666666,0.6666666666666666,0.6666666666666666,0.5,0.0,0.0,0.5,0.5,1.0,0.5,0.0,0.0,0.0,0.5,0.5,0.5,0.5,1.0,1.0,1.0]
            }
            ),
        )
    ]
)
def test_fit_transform_without_id_col_name(self, input_df, expected):
    """fit_transform with explicitly supplied variable lists.

    The continuous/discrete variable lists are derived first via
    get_continous_and_discreate_columns and then passed explicitly to
    fit_transform (no id_col_name given to fit_transform itself).
    Parameter renamed from ``input`` to ``input_df`` to avoid shadowing
    the builtin ``input``.
    """
    preprocessor = PreProcessor.from_params(model_type="classification")

    # Derive the variable lists explicitly; "ID" and "Target" are excluded.
    continuous_vars, discrete_vars = preprocessor.get_continous_and_discreate_columns(input_df, "ID", "Target")

    calculated = preprocessor.fit_transform(
        input_df,
        continuous_vars=continuous_vars,
        discrete_vars=discrete_vars,
        target_column_name="Target"
    )
    # dtypes/categoricals are not checked: binning produces categorical columns.
    pd.testing.assert_frame_equal(calculated, expected, check_dtype=False, check_categorical=False)

@pytest.mark.parametrize(
    ("input_df, expected"),
    [
        # example 1
        (
            pd.DataFrame({
                "ID": list(range(20)),
                "A": [1,2,3,4,5,6,7,8,9,9,8,9,8,9,6,5,6,6,9,8],
                "B": ["Cat"] *5 + ["Dog"]*10 + ["Fish"]*5,
                "C": [1,2,3,4,9,10,11,12,13,5,6,7,8,15,19,18,14,16,13,17],
                "Target": [1]*2 + [0]*5 + [1]*3 + [0]*5 + [1]*5
            }
            ),
            pd.DataFrame({
                'ID': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
                'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
                'B': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'],
                'C': [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17],
                'Target': [1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
                'C_bin': ['1.0 - 3.0','1.0 - 3.0','1.0 - 3.0','3.0 - 5.0','7.0 - 9.0','9.0 - 10.0','10.0 - 12.0','10.0 - 12.0','12.0 - 13.0','3.0 - 5.0','5.0 - 7.0','5.0 - 7.0','7.0 - 9.0','13.0 - 15.0','17.0 - 19.0','17.0 - 19.0','13.0 - 15.0','15.0 - 17.0','12.0 - 13.0','15.0 - 17.0'],
                'B_processed': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'],
                'A_processed': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
                'B_enc': [0.4,0.4,0.4,0.4,0.4,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,1.0,1.0,1.0,1.0,1.0],
                'A_enc': [1.0,1.0,0.0,0.0,0.5,0.5,0.0,0.5,0.6,0.6,0.5,0.6,0.5,0.6,0.5,0.5,0.5,0.5,0.6,0.5],
                'C_enc': [0.6666666666666666,0.6666666666666666,0.6666666666666666,0.5,0.0,0.0,0.5,0.5,1.0,0.5,0.0,0.0,0.0,0.5,0.5,0.5,0.5,1.0,1.0,1.0]
            }
            ),
        )
    ]
)
def test_fit_transform_with_id_col_name(self, input_df, expected):
    """fit_transform with automatic variable detection.

    continuous_vars and discrete_vars are passed as None together with
    id_col_name="ID", so the preprocessor must detect the variable types
    itself; the result must match the explicit-lists variant. Commented-out
    code removed; parameter renamed from ``input`` to ``input_df`` to avoid
    shadowing the builtin ``input``.
    """
    preprocessor = PreProcessor.from_params(model_type="classification")

    calculated = preprocessor.fit_transform(
        input_df,
        continuous_vars=None,
        discrete_vars=None,
        target_column_name="Target",
        id_col_name="ID"
    )
    # dtypes/categoricals are not checked: binning produces categorical columns.
    pd.testing.assert_frame_equal(calculated, expected, check_dtype=False, check_categorical=False)

@staticmethod
def mock_transform(df: pd.DataFrame, args):
"""Mock the transform method."""
Expand Down

0 comments on commit 1163861

Please sign in to comment.