handle nans in categorical columns #118

Merged: 5 commits, merged Mar 1, 2021
Changes from all commits
79 changes: 41 additions & 38 deletions autoPyTorch/data/tabular_feature_validator.py
@@ -11,7 +11,7 @@
import sklearn.utils
from sklearn import preprocessing
from sklearn.base import BaseEstimator
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer
from sklearn.exceptions import NotFittedError

from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SUPPORTED_FEAT_TYPES
@@ -53,16 +53,34 @@ def _fit(
for column in X.columns:
if X[column].isna().all():
X[column] = pd.to_numeric(X[column])
# Also note this change in self.dtypes
if len(self.dtypes) != 0:
self.dtypes[list(X.columns).index(column)] = X[column].dtype

self.enc_columns, self.feat_type = self._get_columns_to_encode(X)

if len(self.enc_columns) > 0:

self.encoder = make_column_transformer(
(preprocessing.OrdinalEncoder(
handle_unknown='use_encoded_value',
unknown_value=-1,
), self.enc_columns),
# impute missing values before encoding,
# remove once sklearn natively supports
# it in ordinal encoding. Sklearn issue:
# "https://github.com/scikit-learn/scikit-learn/issues/17123)"
for column in self.enc_columns:
if X[column].isna().any():
missing_value: typing.Union[int, str] = -1
# make sure for a string column we give
# string missing value else we give numeric
if type(X[column][0]) == str:
missing_value = str(missing_value)
X[column] = X[column].cat.add_categories([missing_value])
X[column] = X[column].fillna(missing_value)

self.encoder = ColumnTransformer(
[
("encoder",
preprocessing.OrdinalEncoder(
handle_unknown='use_encoded_value',
unknown_value=-1,
), self.enc_columns)],
remainder="passthrough"
)

@@ -85,6 +103,7 @@ def comparator(cmp1: str, cmp2: str) -> int:
return 1
else:
raise ValueError((cmp1, cmp2))

self.feat_type = sorted(
self.feat_type,
key=functools.cmp_to_key(comparator)
@@ -182,9 +201,8 @@ def _check_data(
if not isinstance(X, (np.ndarray, pd.DataFrame)) and not scipy.sparse.issparse(X):
raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames,"
" scipy sparse and Python Lists, yet, the provided input is"
" of type {}".format(
type(X)
))
" of type {}".format(type(X))
)

if self.data_type is None:
self.data_type = type(X)
@@ -217,39 +235,25 @@ def _check_data(
# per estimator
enc_columns, _ = self._get_columns_to_encode(X)

if len(enc_columns) > 0:
if np.any(pd.isnull(
X[enc_columns].dropna( # type: ignore[call-overload]
axis='columns', how='all')
)):
# Ignore all NaN columns, and if still a NaN
# Error out
raise ValueError("Categorical features in a dataframe cannot contain "
"missing/NaN values. The OrdinalEncoder used by "
"AutoPyTorch cannot handle this yet (due to a "
"limitation on scikit-learn being addressed via: "
"https://github.com/scikit-learn/scikit-learn/issues/17123)"
)
column_order = [column for column in X.columns]
if len(self.column_order) > 0:
if self.column_order != column_order:
raise ValueError("Changing the column order of the features after fit() is "
"not supported. Fit() method was called with "
"{} whereas the new features have {} as type".format(
self.column_order,
column_order,
))
"{} whereas the new features have {} as type".format(self.column_order,
column_order,)
)
else:
self.column_order = column_order
dtypes = [dtype.name for dtype in X.dtypes]
if len(self.dtypes) > 0:
if self.dtypes != dtypes:
raise ValueError("Changing the dtype of the features after fit() is "
"not supported. Fit() method was called with "
"{} whereas the new features have {} as type".format(
self.dtypes,
dtypes,
))
"{} whereas the new features have {} as type".format(self.dtypes,
dtypes,
)
)
else:
self.dtypes = dtypes

@@ -294,7 +298,8 @@ def _get_columns_to_encode(
"pandas.Series.astype ."
"If working with string objects, the following "
"tutorial illustrates how to work with text data: "
"https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format( # noqa: E501
"https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format(
# noqa: E501
column,
)
)
@@ -349,15 +354,13 @@ def list_to_dataframe(
# If a list was provided, it will be converted to pandas
X_train = pd.DataFrame(data=X_train).infer_objects()
self.logger.warning("The provided feature types to AutoPyTorch are of type list."
"Features have been interpreted as: {}".format(
[(col, t) for col, t in zip(X_train.columns, X_train.dtypes)]
))
"Features have been interpreted as: {}".format([(col, t) for col, t in
zip(X_train.columns, X_train.dtypes)]))
if X_test is not None:
if not isinstance(X_test, list):
self.logger.warning("Train features are a list while the provided test data"
"is {}. X_test will be casted as DataFrame.".format(
type(X_test)
))
"is {}. X_test will be casted as DataFrame.".format(type(X_test))
)
X_test = pd.DataFrame(data=X_test).infer_objects()
return X_train, X_test
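To make the behaviour of the new code path concrete, here is a minimal standalone sketch (not part of the PR diff) of how the pre-encoding imputation interacts with `OrdinalEncoder` inside a `ColumnTransformer`. The toy column names are invented, and `handle_unknown='use_encoded_value'` assumes scikit-learn >= 0.24, per the linked scikit-learn issue:

```python
import pandas as pd
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer

# Toy frame: one categorical column containing a NaN plus one numeric column.
X = pd.DataFrame({
    "color": pd.Series(["red", "blue", None], dtype="category"),
    "size": [1.0, 2.0, 3.0],
})

# Mirror the pre-encoding imputation from the diff: the NaN becomes an
# explicit "-1" category (a string here, because the column holds strings).
missing_value = "-1"
X["color"] = X["color"].cat.add_categories([missing_value]).fillna(missing_value)

encoder = ColumnTransformer(
    [("encoder",
      preprocessing.OrdinalEncoder(handle_unknown="use_encoded_value",
                                   unknown_value=-1),
      ["color"])],
    remainder="passthrough",
)
print(encoder.fit_transform(X))
# The encoded "color" column comes first, the passthrough "size" column second.
```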

13 changes: 10 additions & 3 deletions test/test_data/test_feature_validator.py
@@ -231,10 +231,17 @@ def test_featurevalidator_unsupported_numpy(input_data_featuretest):
),
indirect=True
)
def test_featurevalidator_unsupported_pandas(input_data_featuretest):
def test_featurevalidator_categorical_nan(input_data_featuretest):
validator = TabularFeatureValidator()
with pytest.raises(ValueError, match=r"Categorical features in a dataframe.*missing/NaN"):
validator.fit(input_data_featuretest)
validator.fit(input_data_featuretest)
transformed_X = validator.transform(input_data_featuretest)
assert any(pd.isna(input_data_featuretest))
assert any((-1 in categories) or ('-1' in categories) for categories in
validator.encoder.named_transformers_['encoder'].categories_)
assert np.shape(input_data_featuretest) == np.shape(transformed_X)
assert np.issubdtype(transformed_X.dtype, np.number)
assert validator._is_fitted
assert isinstance(transformed_X, np.ndarray)
@franchuterivera (Contributor) commented on Mar 1, 2021:


Can you add a check so that we see that a nan was encoded?

Like, can you add a check that:

  • makes sure that TabularFeatureValidator encoder categories consider the -1 and input_data_featuretest has a nan?
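For reference, a rough standalone version of the check being asked for, mirroring the assertions in the updated test above (the one-column DataFrame is invented, and this assumes the PR branch of autoPyTorch is importable):

```python
import pandas as pd

from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator

# Toy input: a single categorical column with a NaN (column name invented).
X = pd.DataFrame({"A": pd.Series(["a", None, "b"], dtype="category")})
assert pd.isna(X).values.any()  # the raw input really contains a NaN

validator = TabularFeatureValidator()
validator.fit(X)

# After fit(), the OrdinalEncoder inside the ColumnTransformer should list the
# "-1" placeholder that replaced the NaN among its learned categories.
fitted_encoder = validator.encoder.named_transformers_["encoder"]
assert any(
    (-1 in categories) or ("-1" in categories)
    for categories in fitted_encoder.categories_
)
```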



@pytest.mark.parametrize(