diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index bcdb9ada15bb3..60342f1b6cba5 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -480,6 +480,49 @@ This function is often used along with discretization functions like ``cut``: See also :func:`Series.str.get_dummies `. +.. versionadded:: 0.15.0 + +:func:`get_dummies` also accepts a DataFrame. By default all categorical +variables (categorical in the statistical sense, +those with `object` or `category` dtype) are encoded as dummy variables. + + +.. ipython:: python + + df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['c', 'c', 'b'], + 'C': [1, 2, 3]}) + pd.get_dummies(df) + +All non-object columns are included untouched in the output. + +You can control the columns that are encoded with the ``columns`` keyword. + +.. ipython:: python + + pd.get_dummies(df, columns=['A']) + +Notice that the ``B`` column is still included in the output, it just hasn't +been encoded. You can drop ``B`` before calling ``get_dummies`` if you don't +want to include it in the output. + +As with the Series version, you can pass values for the ``prefix`` and +``prefix_sep``. By default the column name is used as the prefix, and '_' as +the prefix separator. You can specify ``prefix`` and ``prefix_sep`` in 3 ways: + +- string: Use the same value for ``prefix`` or ``prefix_sep`` for each column + to be encoded +- list: Must be the same length as the number of columns being encoded. +- dict: Mapping column name to prefix + +.. 
ipython:: python + + simple = pd.get_dummies(df, prefix='new_prefix') + simple + from_list = pd.get_dummies(df, prefix=['from_A', 'from_B']) + from_list + from_dict = pd.get_dummies(df, prefix={'B': 'from_B', 'A': 'from_A'}) + from_dict + Factorizing values ------------------ diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 6e2d1d974429b..a297fb6ad32ce 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -417,6 +417,9 @@ There are no prior version deprecations that are taking effect as of 0.15.0. Deprecations ~~~~~~~~~~~~ +The ``convert_dummies`` method has been deprecated in favor of +``get_dummies`` (:issue:`8140`). + .. _whatsnew_0150.knownissues: Known Issues @@ -461,7 +464,15 @@ Enhancements +- The ``get_dummies`` method can now be used on DataFrames. By default only + categorical columns are encoded as 0's and 1's, while other columns are + left untouched. + + .. ipython:: python + df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['c', 'c', 'b'], + 'C': [1, 2, 3]}) + pd.get_dummies(df) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index f2817e04819bb..3ba589b8fa35d 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -979,27 +979,42 @@ def convert_dummies(data, cat_variables, prefix_sep='_'): ------- dummies : DataFrame """ + import warnings + + warnings.warn("'convert_dummies' is deprecated and will be removed " + "in a future release. 
Use 'get_dummies' instead.", + FutureWarning) + result = data.drop(cat_variables, axis=1) for variable in cat_variables: - dummies = get_dummies(data[variable], prefix=variable, - prefix_sep=prefix_sep) + dummies = _get_dummies_1d(data[variable], prefix=variable, + prefix_sep=prefix_sep) result = result.join(dummies) return result -def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False): +def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, + columns=None): """ Convert categorical variable into dummy/indicator variables Parameters ---------- - data : array-like or Series - prefix : string, default None + data : array-like, Series, or DataFrame + prefix : string, list of strings, or dict of strings, default None String to append DataFrame column names + Pass a list with length equal to the number of columns + when calling get_dummies on a DataFrame. Alternatively, `prefix` + can be a dictionary mapping column names to prefixes. prefix_sep : string, default '_' - If appending prefix, separator/delimiter to use + If appending prefix, separator/delimiter to use. Or pass a + list or dictionary as with `prefix`. dummy_na : bool, default False Add a column to indicate NaNs, if False NaNs are ignored. + columns : list-like, default None + Column names in the DataFrame to be encoded. + If `columns` is None then all the columns with + `object` or `category` dtype will be converted. Returns ------- @@ -1031,9 +1046,71 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False): 1 0 1 0 2 0 0 1 + >>> df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], + ... 'C': [1, 2, 3]}) + + >>> get_dummies(df, prefix=['col1', 'col2']) + C col1_a col1_b col2_a col2_b col2_c + 0 1 1 0 0 1 0 + 1 2 0 1 1 0 0 + 2 3 1 0 0 0 1 + See also ``Series.str.get_dummies``. 
""" + from pandas.tools.merge import concat + from itertools import cycle + + if isinstance(data, DataFrame): + # determine columns being encoded + + if columns is None: + columns_to_encode = data.select_dtypes(include=['object', + 'category']).columns + else: + columns_to_encode = columns + + # check lengths up front: zip() below silently truncates otherwise + def check_len(item, name): + length_msg = ("Length of '{0}' ({1}) did " + "not match the length of the columns " + "being encoded ({2}).") + + if com.is_list_like(item): + if not len(item) == len(columns_to_encode): + raise ValueError(length_msg.format(name, len(item), + len(columns_to_encode))) + + check_len(prefix, 'prefix') + check_len(prefix_sep, 'prefix_sep') + if isinstance(prefix, compat.string_types): + prefix = cycle([prefix]) + if isinstance(prefix, dict): + prefix = [prefix[col] for col in columns_to_encode] + + if prefix is None: + prefix = columns_to_encode + + # normalize separators: cycle a string, look up a dict per column + if isinstance(prefix_sep, compat.string_types): + prefix_sep = cycle([prefix_sep]) + elif isinstance(prefix_sep, dict): + prefix_sep = [prefix_sep[col] for col in columns_to_encode] + + result = data.drop(columns_to_encode, axis=1) + with_dummies = [result] + for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep): + + dummy = _get_dummies_1d(data[col], prefix=pre, + prefix_sep=sep, dummy_na=dummy_na) + with_dummies.append(dummy) + result = concat(with_dummies, axis=1) + else: + result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na) + return result + + +def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False): # Series avoids inconsistent NaN handling cat = Categorical.from_array(Series(data)) levels = cat.levels diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index 42427617991af..3cc2d94789a8d 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -149,6 +149,11 @@ def test_multiindex(self): class TestGetDummies(tm.TestCase): + + def 
setUp(self): + self.df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'], + 'C': [1, 2, 3]}) + def test_basic(self): s_list = list('abc') s_series = Series(s_list) @@ -209,6 +214,114 @@ def test_unicode(self): # See GH 6885 - get_dummies chokes on unicode values u('letter_%s') % eacute: {0: 0.0, 1: 1.0, 2: 1.0}}) assert_frame_equal(res, exp) + def test_dataframe_dummies_all_obj(self): + df = self.df[['A', 'B']] + result = get_dummies(df) + expected = DataFrame({'A_a': [1., 0, 1], 'A_b': [0., 1, 0], + 'B_b': [1., 1, 0], 'B_c': [0., 0, 1]}) + assert_frame_equal(result, expected) + + def test_dataframe_dummies_mix_default(self): + df = self.df + result = get_dummies(df) + expected = DataFrame({'C': [1, 2, 3], 'A_a': [1., 0, 1], + 'A_b': [0., 1, 0], 'B_b': [1., 1, 0], + 'B_c': [0., 0, 1]}) + expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] + assert_frame_equal(result, expected) + + def test_dataframe_dummies_prefix_list(self): + prefixes = ['from_A', 'from_B'] + df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'], + 'C': [1, 2, 3]}) + result = get_dummies(df, prefix=prefixes) + expected = DataFrame({'C': [1, 2, 3], 'from_A_a': [1., 0, 1], + 'from_A_b': [0., 1, 0], 'from_B_b': [1., 1, 0], + 'from_B_c': [0., 0, 1]}) + expected = expected[['C', 'from_A_a', 'from_A_b', 'from_B_b', + 'from_B_c']] + assert_frame_equal(result, expected) + + def test_dataframe_dummies_prefix_str(self): + # not recommended: one prefix for every column duplicates labels 
+ df = self.df + result = get_dummies(df, prefix='bad') + expected = DataFrame([[1, 1., 0., 1., 0.], + [2, 0., 1., 1., 0.], + [3, 1., 0., 0., 1.]], + columns=['C', 'bad_a', 'bad_b', 'bad_b', 'bad_c']) + assert_frame_equal(result, expected) + + def test_dataframe_dummies_subset(self): + df = self.df + result = get_dummies(df, prefix=['from_A'], + columns=['A']) + expected = DataFrame({'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0], + 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) + assert_frame_equal(result, expected) + + def test_dataframe_dummies_prefix_sep(self): + df = self.df + result = get_dummies(df, prefix_sep='..') + expected = DataFrame({'C': [1, 2, 3], 'A..a': [1., 0, 1], + 'A..b': [0., 1, 0], 'B..b': [1., 1, 0], + 'B..c': [0., 0, 1]}) + expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']] + assert_frame_equal(result, expected) + + result = get_dummies(df, prefix_sep=['..', '__']) + expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'}) + assert_frame_equal(result, expected) + + result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'}) + assert_frame_equal(result, expected) + + def test_dataframe_dummies_prefix_bad_length(self): + with tm.assertRaises(ValueError): + get_dummies(self.df, prefix=['too few']) + + def test_dataframe_dummies_prefix_sep_bad_length(self): + with tm.assertRaises(ValueError): + get_dummies(self.df, prefix_sep=['bad']) + + def test_dataframe_dummies_prefix_dict(self): + prefixes = {'A': 'from_A', 'B': 'from_B'} + df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'], + 'C': [1, 2, 3]}) + result = get_dummies(df, prefix=prefixes) + expected = DataFrame({'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0], + 'from_B_b': [1., 1, 0], 'from_B_c': [0., 0, 1], + 'C': [1, 2, 3]}) + assert_frame_equal(result, expected) + + def test_dataframe_dummies_with_na(self): + df = self.df + df.loc[3, :] = [np.nan, np.nan, np.nan] + result = get_dummies(df, dummy_na=True) + expected = DataFrame({'C': [1, 2, 3, np.nan], 'A_a': [1., 0, 1, 
0], + 'A_b': [0., 1, 0, 0], 'A_nan': [0., 0, 0, 1], 'B_b': [1., 1, 0, 0], + 'B_c': [0., 0, 1, 0], 'B_nan': [0., 0, 0, 1]}) + expected = expected[['C', 'A_a', 'A_b', 'A_nan', 'B_b', 'B_c', + 'B_nan']] + assert_frame_equal(result, expected) + + result = get_dummies(df, dummy_na=False) + expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] + assert_frame_equal(result, expected) + + def test_dataframe_dummies_with_categorical(self): + df = self.df + df['cat'] = pd.Categorical(['x', 'y', 'y']) + result = get_dummies(df) + expected = DataFrame({'C': [1, 2, 3], 'A_a': [1., 0, 1], + 'A_b': [0., 1, 0], 'B_b': [1., 1, 0], + 'B_c': [0., 0, 1], 'cat_x': [1., 0, 0], + 'cat_y': [0., 1, 1]}) + expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c', + 'cat_x', 'cat_y']] + assert_frame_equal(result, expected) + + class TestConvertDummies(tm.TestCase): def test_convert_dummies(self): df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', @@ -218,8 +331,9 @@ def test_convert_dummies(self): 'C': np.random.randn(8), 'D': np.random.randn(8)}) - result = convert_dummies(df, ['A', 'B']) - result2 = convert_dummies(df, ['A', 'B'], prefix_sep='.') + with tm.assert_produces_warning(FutureWarning): + result = convert_dummies(df, ['A', 'B']) + result2 = convert_dummies(df, ['A', 'B'], prefix_sep='.') expected = DataFrame({'A_foo': [1, 0, 1, 0, 1, 0, 1, 1], 'A_bar': [0, 1, 0, 1, 0, 1, 0, 0],