From 3e6a47b066e4709ca67f0116866aaae049eb4cf0 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Tue, 17 Jul 2018 23:30:36 +0100 Subject: [PATCH 1/7] WIP: DataFrame mode docstring --- pandas/core/frame.py | 60 +++++++++++++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 17 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4578d2ac08199..36981cb383ffb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7301,23 +7301,22 @@ def _get_agg_axis(self, axis_num): def mode(self, axis=0, numeric_only=False, dropna=True): """ - Gets the mode(s) of each element along the axis selected. Adds a row - for each mode per label, fills in gaps with nan. + Get the mode(s) of each element along the selected axis. - Note that there could be multiple values returned for the selected - axis (when more than one item share the maximum frequency), which is - the reason why a dataframe is returned. If you want to impute missing - values with the mode in a dataframe ``df``, you can just do this: - ``df.fillna(df.mode().iloc[0])`` + The mode of a set of values is the value or set of values that appear + most often. + + Adds a row for each mode per label, filling gaps with NaN. Parameters ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 - * 0 or 'index' : get mode of each column - * 1 or 'columns' : get mode of each row - numeric_only : boolean, default False - if True, only apply to numeric columns - dropna : boolean, default True + The axis to iterate over while searching for the mode. + To find the mode for each column, use ``axis=0``. + To find the mode for each row, use ``axis=1``. + numeric_only : bool, default False + If True, only apply to numeric columns. + dropna : bool, default True Don't consider counts of NaN/NaT. .. 
versionadded:: 0.24.0 @@ -7325,14 +7324,41 @@ def mode(self, axis=0, numeric_only=False, dropna=True): Returns ------- modes : DataFrame (sorted) + A DataFrame containing the modes + If ``axis=0``, there will be one column per column in the original + DataFrame, with as many rows as there are modes. + If ``axis=1``, there will be one row per row in the original + DataFrame, with as many columns as there are modes. + + Notes + ----- + There may be multiple values returned for the selected + axis when more than one item share the maximum frequency, which is + the reason why a DataFrame is returned. + + See Also + -------- + Series.mode : Return the highest frequency value in a Series. + Series.value_counts : Return the counts of values in a Series. Examples -------- - >>> df = pd.DataFrame({'A': [1, 2, 1, 2, 1, 2, 3]}) - >>> df.mode() - A - 0 1 - 1 2 + + ``mode`` returns a DataFrame with multiple rows if there is more than + one mode. Missing entries are imputed with NaN. + + >>> df = pd.DataFrame({'name': ['Alice', 'Bob', 'Alice', 'Bob'], + ... 'age': [21, 45, 33, 21]}) + >>> df + name age + 0 Alice 21 + 1 Bob 45 + 2 Alice 33 + 3 Bob 21 + >>> df.mode() + name age + 0 Alice 21.0 + 1 Bob NaN """ data = self if not numeric_only else self._get_numeric_data() From 806c3875efbfecf1ec940ccee30bba044c5163a4 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Wed, 18 Jul 2018 17:09:02 +0100 Subject: [PATCH 2/7] WIP: Some more fixes to the DataFrame.mode docstring --- pandas/core/frame.py | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 36981cb383ffb..29b30b9ebcf73 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7303,8 +7303,8 @@ def mode(self, axis=0, numeric_only=False, dropna=True): """ Get the mode(s) of each element along the selected axis. - The mode of a set of values is the value or set of values that appear - most often. 
+ The mode of a set of values is the value that appears most often. + It can be multiple values. Adds a row for each mode per label, filling gaps with NaN. @@ -7323,19 +7323,13 @@ def mode(self, axis=0, numeric_only=False, dropna=True): Returns ------- - modes : DataFrame (sorted) + DataFrame (sorted) A DataFrame containing the modes If ``axis=0``, there will be one column per column in the original DataFrame, with as many rows as there are modes. If ``axis=1``, there will be one row per row in the original DataFrame, with as many columns as there are modes. - Notes - ----- - There may be multiple values returned for the selected - axis when more than one item share the maximum frequency, which is - the reason why a DataFrame is returned. - See Also -------- Series.mode : Return the highest frequency value in a Series. @@ -7350,15 +7344,15 @@ def mode(self, axis=0, numeric_only=False, dropna=True): >>> df = pd.DataFrame({'name': ['Alice', 'Bob', 'Alice', 'Bob'], ... 'age': [21, 45, 33, 21]}) >>> df - name age - 0 Alice 21 - 1 Bob 45 - 2 Alice 33 - 3 Bob 21 - >>> df.mode() - name age - 0 Alice 21.0 - 1 Bob NaN + name age + 0 Alice 21 + 1 Bob 45 + 2 Alice 33 + 3 Bob 21 + >>> df.mode() + name age + 0 Alice 21.0 + 1 Bob NaN """ data = self if not numeric_only else self._get_numeric_data() From 6d4d521c0cfac516522f8bb56ae25ab27897a41c Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Thu, 16 Aug 2018 16:49:06 +0100 Subject: [PATCH 3/7] Improving examples and notes --- pandas/core/frame.py | 52 +++++++++++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4a02f5fc0f6a8..ecec6380bf982 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7237,7 +7237,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True): Returns ------- - DataFrame (sorted) + DataFrame A DataFrame containing the modes If ``axis=0``, there will be one column per column in the original DataFrame, 
with as many rows as there are modes. @@ -7249,24 +7249,50 @@ def mode(self, axis=0, numeric_only=False, dropna=True): Series.mode : Return the highest frequency value in a Series. Series.value_counts : Return the counts of values in a Series. + Notes + ----- + If there is only one occurrence of each value (no repeated values), + no mode will be returned. + Examples -------- + >>> df = pd.DataFrame([('bird', 2, 2), + ... ('mammal', 4, 0), + ... ('insect', 8, 0), + ... ('bird', 2, 2)], + ... index=('penguin', 'horse', 'spider', 'ostrich'), + ... columns=('species', 'legs', 'wings')) + >>> df + species legs wings + penguin bird 2 2 + horse mammal 4 0 + spider insect 8 0 + ostrich bird 2 2 + ``mode`` returns a DataFrame with multiple rows if there is more than - one mode. Missing entries are imputed with NaN. + one mode (like for wings). Missing entries are imputed with NaN: - >>> df = pd.DataFrame({'name': ['Alice', 'Bob', 'Alice', 'Bob'], - ... 'age': [21, 45, 33, 21]}) - >>> df - name age - 0 Alice 21 - 1 Bob 45 - 2 Alice 33 - 3 Bob 21 >>> df.mode() - name age - 0 Alice 21.0 - 1 Bob NaN + species legs wings + 0 bird 2.0 0 + 1 NaN NaN 2 + + The mode of only numeric columns can be computed: + + >>> df.mode(numeric_only=True) + legs wings + 0 2.0 0 + 1 NaN 2 + + To compute the mode over columns and not rows, use the axis parameter: + + >>> df.mode(axis='columns') + 0 + penguin 2.0 + horse NaN + spider NaN + ostrich 2.0 """ data = self if not numeric_only else self._get_numeric_data() From 1f36b7c7a494f95712e040f3101499230c9560f8 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Fri, 17 Aug 2018 21:29:18 +0100 Subject: [PATCH 4/7] Finishing the examples, adding one for dropna, and some last fixes. 
--- pandas/core/frame.py | 70 +++++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 33 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f70a38d899416..914f3f31a59a5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7218,14 +7218,12 @@ def mode(self, axis=0, numeric_only=False, dropna=True): The mode of a set of values is the value that appears most often. It can be multiple values. - Adds a row for each mode per label, filling gaps with NaN. - Parameters ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to iterate over while searching for the mode. - To find the mode for each column, use ``axis=0``. - To find the mode for each row, use ``axis=1``. + To find the mode for each column, use ``axis='index'``. + To find the mode for each row, use ``axis='columns'``. numeric_only : bool, default False If True, only apply to numeric columns. dropna : bool, default True @@ -7236,11 +7234,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True): Returns ------- DataFrame - A DataFrame containing the modes - If ``axis=0``, there will be one column per column in the original - DataFrame, with as many rows as there are modes. - If ``axis=1``, there will be one row per row in the original - DataFrame, with as many columns as there are modes. + The modes of each column or row. See Also -------- @@ -7249,48 +7243,58 @@ def mode(self, axis=0, numeric_only=False, dropna=True): Notes ----- - If there is only one occurrence of each value (no repeated values), - no mode will be returned. + Every column or row of the resulting DataFrame contains all its modes. + And possibly NaN values at the end (if other columns or rows have a + higher number of modes). Examples -------- >>> df = pd.DataFrame([('bird', 2, 2), - ... ('mammal', 4, 0), - ... ('insect', 8, 0), - ... ('bird', 2, 2)], - ... index=('penguin', 'horse', 'spider', 'ostrich'), + ... ('mammal', 4, np.nan), + ... ('arthropod', 8, 0), + ... 
('bird', 2, np.nan)], + ... index=('falcon', 'horse', 'spider', 'ostrich'), ... columns=('species', 'legs', 'wings')) >>> df - species legs wings - penguin bird 2 2 - horse mammal 4 0 - spider insect 8 0 - ostrich bird 2 2 + species legs wings + falcon bird 2 2.0 + horse mammal 4 NaN + spider arthropod 8 0.0 + ostrich bird 2 NaN - ``mode`` returns a DataFrame with multiple rows if there is more than - one mode (like for wings). Missing entries are imputed with NaN: + By default, missing values are not considered, and the mode of winds + are both 0 and 2. The second row of species and legs contains NaN, + because they have only one mode, but the DataFrame has two rows. >>> df.mode() species legs wings - 0 bird 2.0 0 - 1 NaN NaN 2 + 0 bird 2.0 0.0 + 1 NaN NaN 2.0 + + Setting ``dropna=False`` NaN values are considered and they can be the + mode (like for wings). + + >>> df.mode(dropna=False) + species legs wings + 0 bird 2 NaN - The mode of only numeric columns can be computed: + Setting ``numeric_only=True``, only the mode of numeric columns is + computed, and columns of other types are ignored. 
>>> df.mode(numeric_only=True) legs wings - 0 2.0 0 - 1 NaN 2 + 0 2.0 0.0 + 1 NaN 2.0 To compute the mode over columns and not rows, use the axis parameter: - >>> df.mode(axis='columns') - 0 - penguin 2.0 - horse NaN - spider NaN - ostrich 2.0 + >>> df.mode(axis='columns', numeric_only=True) + 0 1 + falcon 2.0 NaN + horse 4.0 NaN + spider 0.0 8.0 + ostrich 2.0 NaN """ data = self if not numeric_only else self._get_numeric_data() From 74bae39ee3710ceb126125ba226edbc30e8f47f0 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Sat, 18 Aug 2018 15:06:08 +0100 Subject: [PATCH 5/7] Removing unnecessary blank line --- pandas/core/frame.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 914f3f31a59a5..ec47af443e6f1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7249,7 +7249,6 @@ def mode(self, axis=0, numeric_only=False, dropna=True): Examples -------- - >>> df = pd.DataFrame([('bird', 2, 2), ... ('mammal', 4, np.nan), ... ('arthropod', 8, 0), From ed7b792c1f1028c94f747399f101c869342af031 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Mon, 20 Aug 2018 10:49:51 +0100 Subject: [PATCH 6/7] Updating DataFrame.mode docstring --- pandas/core/frame.py | 80 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 65 insertions(+), 15 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7a10fb1023806..20e4fc67a7252 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7217,38 +7217,88 @@ def _get_agg_axis(self, axis_num): def mode(self, axis=0, numeric_only=False, dropna=True): """ - Gets the mode(s) of each element along the axis selected. Adds a row - for each mode per label, fills in gaps with nan. + Get the mode(s) of each element along the selected axis. - Note that there could be multiple values returned for the selected - axis (when more than one item share the maximum frequency), which is - the reason why a dataframe is returned. 
If you want to impute missing - values with the mode in a dataframe ``df``, you can just do this: - ``df.fillna(df.mode().iloc[0])`` + The mode of a set of values is the value that appears most often. + It can be multiple values. Parameters ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to iterate over while searching for the mode: + * 0 or 'index' : get mode of each column * 1 or 'columns' : get mode of each row - numeric_only : boolean, default False - if True, only apply to numeric columns - dropna : boolean, default True + numeric_only : bool, default False + If True, only apply to numeric columns. + dropna : bool, default True Don't consider counts of NaN/NaT. .. versionadded:: 0.24.0 Returns ------- - modes : DataFrame (sorted) + DataFrame + The modes of each column or row. + + See Also + -------- + Series.mode : Return the highest frequency value in a Series. + Series.value_counts : Return the counts of values in a Series. + + Notes + ----- + Every column or row of the resulting DataFrame contains all its modes. + And possibly NaN values at the end (if other columns or rows have a + higher number of modes). Examples -------- - >>> df = pd.DataFrame({'A': [1, 2, 1, 2, 1, 2, 3]}) + >>> df = pd.DataFrame([('bird', 2, 2), + ... ('mammal', 4, np.nan), + ... ('arthropod', 8, 0), + ... ('bird', 2, np.nan)], + ... index=('falcon', 'horse', 'spider', 'ostrich'), + ... columns=('species', 'legs', 'wings')) + >>> df + species legs wings + falcon bird 2 2.0 + horse mammal 4 NaN + spider arthropod 8 0.0 + ostrich bird 2 NaN + + By default, missing values are not considered, and the mode of winds + are both 0 and 2. The second row of species and legs contains NaN, + because they have only one mode, but the DataFrame has two rows. + >>> df.mode() - A - 0 1 - 1 2 + species legs wings + 0 bird 2.0 0.0 + 1 NaN NaN 2.0 + + Setting ``dropna=False`` NaN values are considered and they can be the + mode (like for wings). 
+ + >>> df.mode(dropna=False) + species legs wings + 0 bird 2 NaN + + Setting ``numeric_only=True``, only the mode of numeric columns is + computed, and columns of other types are ignored. + + >>> df.mode(numeric_only=True) + legs wings + 0 2.0 0.0 + 1 NaN 2.0 + + To compute the mode over columns and not rows, use the axis parameter: + + >>> df.mode(axis='columns', numeric_only=True) + 0 1 + falcon 2.0 NaN + horse 4.0 NaN + spider 0.0 8.0 + ostrich 2.0 NaN """ data = self if not numeric_only else self._get_numeric_data() From 5a410bf7f9cee53b623cb119dca0a98c7cad5b9d Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Sun, 2 Sep 2018 18:50:00 +0200 Subject: [PATCH 7/7] Addressing comments from code review --- pandas/core/frame.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e96d502f4ccad..802e4102ba4ed 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7240,12 +7240,6 @@ def mode(self, axis=0, numeric_only=False, dropna=True): Series.mode : Return the highest frequency value in a Series. Series.value_counts : Return the counts of values in a Series. - Notes - ----- - Every column or row of the resulting DataFrame contains all its modes. - And possibly NaN values at the end (if other columns or rows have a - higher number of modes). - Examples -------- >>> df = pd.DataFrame([('bird', 2, 2), @@ -7261,8 +7255,8 @@ def mode(self, axis=0, numeric_only=False, dropna=True): spider arthropod 8 0.0 ostrich bird 2 NaN - By default, missing values are not considered, and the mode of winds - are both 0 and 2. The second row of species and legs contains NaN, + By default, missing values are not considered, and the modes of wings + are both 0 and 2. The second row of species and legs contains ``NaN``, because they have only one mode, but the DataFrame has two rows. 
>>> df.mode() @@ -7270,8 +7264,8 @@ def mode(self, axis=0, numeric_only=False, dropna=True): 0 bird 2.0 0.0 1 NaN NaN 2.0 - Setting ``dropna=False`` NaN values are considered and they can be the - mode (like for wings). + Setting ``dropna=False``, ``NaN`` values are considered and they can be + the mode (like for wings). >>> df.mode(dropna=False) species legs wings