DOC: correct docstring examples (pandas-dev#3439) (pandas-dev#16432)

stangirala · Jun 11, 2017 · 105f5b8 · 105f5b8
1 parent c701c65
commit 105f5b8
Show file tree

Hide file tree

Showing 5 changed files with 108 additions and 80 deletions.
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
@@ -59,6 +59,15 @@ if [ "$DOC" ]; then
     git remote -v
 
     git push origin gh-pages -f
+
+    echo "Running doctests"
+    cd "$TRAVIS_BUILD_DIR"
+    pytest --doctest-modules \
+           pandas/core/reshape/concat.py \
+           pandas/core/reshape/pivot.py \
+           pandas/core/reshape/reshape.py \
+           pandas/core/reshape/tile.py
+
 fi
 
 exit 0
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
@@ -197,6 +197,8 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
        0
     a  2
     >>> pd.concat([df5, df6], verify_integrity=True)
+    Traceback (most recent call last):
+        ...
     ValueError: Indexes have overlapping values: ['a']
     """
     op = _Concatenator(objs, axis=axis, join_axes=join_axes,

diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
@@ -50,26 +50,36 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
 
     Examples
     --------
+    >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
+    ...                          "bar", "bar", "bar", "bar"],
+    ...                    "B": ["one", "one", "one", "two", "two",
+    ...                          "one", "one", "two", "two"],
+    ...                    "C": ["small", "large", "large", "small",
+    ...                          "small", "large", "small", "small",
+    ...                          "large"],
+    ...                    "D": [1, 2, 2, 3, 3, 4, 5, 6, 7]})
     >>> df
-       A   B   C      D
-    0  foo one small  1
-    1  foo one large  2
-    2  foo one large  2
-    3  foo two small  3
-    4  foo two small  3
-    5  bar one large  4
-    6  bar one small  5
-    7  bar two small  6
-    8  bar two large  7
+         A    B      C  D
+    0  foo  one  small  1
+    1  foo  one  large  2
+    2  foo  one  large  2
+    3  foo  two  small  3
+    4  foo  two  small  3
+    5  bar  one  large  4
+    6  bar  one  small  5
+    7  bar  two  small  6
+    8  bar  two  large  7
 
     >>> table = pivot_table(df, values='D', index=['A', 'B'],
     ...                     columns=['C'], aggfunc=np.sum)
     >>> table
-              small  large
-    foo  one  1      4
-         two  6      NaN
-    bar  one  5      4
-         two  6      7
+    ... # doctest: +NORMALIZE_WHITESPACE
+    C        large  small
+    A   B
+    bar one    4.0    5.0
+        two    7.0    6.0
+    foo one    4.0    1.0
+        two    NaN    6.0
 
     Returns
     -------
@@ -445,27 +455,27 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None,
 
     Examples
     --------
-    >>> a
-    array([foo, foo, foo, foo, bar, bar,
-           bar, bar, foo, foo, foo], dtype=object)
-    >>> b
-    array([one, one, one, two, one, one,
-           one, two, two, two, one], dtype=object)
-    >>> c
-    array([dull, dull, shiny, dull, dull, shiny,
-           shiny, dull, shiny, shiny, shiny], dtype=object)
-
-    >>> crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
-    b    one          two
-    c    dull  shiny  dull  shiny
+    >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar",
+    ...               "bar", "bar", "foo", "foo", "foo"], dtype=object)
+    >>> b = np.array(["one", "one", "one", "two", "one", "one",
+    ...               "one", "two", "two", "two", "one"], dtype=object)
+    >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny",
+    ...               "shiny", "dull", "shiny", "shiny", "shiny"],
+    ...               dtype=object)
+
+    >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
+    ... # doctest: +NORMALIZE_WHITESPACE
+    b   one        two
+    c   dull shiny dull shiny
     a
-    bar  1     2      1     0
-    foo  2     2      1     2
+    bar    1     2    1     0
+    foo    2     2    1     2
 
     >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])
     >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f'])
     >>> crosstab(foo, bar)  # 'c' and 'f' are not represented in the data,
-                            # but they still will be counted in the output
+    ...                     # but they still will be counted in the output
+    ... # doctest: +SKIP
     col_0  d  e  f
     row_0
     a      1  0  0

diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
@@ -48,23 +48,23 @@ class _Unstacker(object):
     >>> import pandas as pd
     >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
     ...                                    ('two', 'a'), ('two', 'b')])
-    >>> s = pd.Series(np.arange(1.0, 5.0), index=index)
+    >>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index)
     >>> s
-    one  a   1
-         b   2
-    two  a   3
-         b   4
-    dtype: float64
+    one  a    1
+         b    2
+    two  a    3
+         b    4
+    dtype: int64
 
     >>> s.unstack(level=-1)
-         a   b
+         a  b
     one  1  2
     two  3  4
 
     >>> s.unstack(level=0)
        one  two
-    a  1   2
-    b  3   4
+    a    1    3
+    b    2    4
 
     Returns
     -------
@@ -789,18 +789,18 @@ def lreshape(data, groups, dropna=True, label=None):
     >>> import pandas as pd
     >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
     ...                      'team': ['Red Sox', 'Yankees'],
-    ...                      'year1': [2007, 2008], 'year2': [2008, 2008]})
+    ...                      'year1': [2007, 2007], 'year2': [2008, 2008]})
     >>> data
        hr1  hr2     team  year1  year2
     0  514  545  Red Sox   2007   2008
     1  573  526  Yankees   2007   2008
 
     >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
-          team   hr  year
-    0  Red Sox  514  2007
-    1  Yankees  573  2007
-    2  Red Sox  545  2008
-    3  Yankees  526  2008
+          team  year   hr
+    0  Red Sox  2007  514
+    1  Yankees  2007  573
+    2  Red Sox  2008  545
+    3  Yankees  2008  526
 
     Returns
     -------
@@ -905,11 +905,12 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'):
     ...                   })
     >>> df["id"] = df.index
     >>> df
-    A1970 A1980  B1970  B1980         X  id
+      A1970 A1980  B1970  B1980         X  id
     0     a     d    2.5    3.2 -1.085631   0
     1     b     e    1.2    1.3  0.997345   1
     2     c     f    0.7    0.1  0.282978   2
     >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
+    ... # doctest: +NORMALIZE_WHITESPACE
                     X  A    B
     id year
     0  1970 -1.085631  a  2.5
@@ -940,6 +941,7 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'):
     8      3      3  2.1  2.9
     >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
     >>> l
+    ... # doctest: +NORMALIZE_WHITESPACE
                       ht
     famid birth age
     1     1     1    2.8
@@ -979,41 +981,44 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'):
 
     Unwieldy column names are also handled
 
+    >>> np.random.seed(0)
     >>> df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3),
     ...                    'A(quarterly)-2011': np.random.rand(3),
     ...                    'B(quarterly)-2010': np.random.rand(3),
     ...                    'B(quarterly)-2011': np.random.rand(3),
     ...                    'X' : np.random.randint(3, size=3)})
     >>> df['id'] = df.index
-    >>> df
-      A(quarterly)-2010 A(quarterly)-2011 B(quarterly)-2010 B(quarterly)-2011
-    0          0.531828          0.724455          0.322959          0.293714
-    1          0.634401          0.611024          0.361789          0.630976
-    2          0.849432          0.722443          0.228263          0.092105
-    \
+    >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
+       A(quarterly)-2010  A(quarterly)-2011  B(quarterly)-2010  ...
+    0           0.548814           0.544883           0.437587  ...
+    1           0.715189           0.423655           0.891773  ...
+    2           0.602763           0.645894           0.963663  ...
        X  id
     0  0   0
     1  1   1
-    2  2   2
-    >>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'],
-                        i='id', j='year', sep='-')
-             X     A(quarterly)  B(quarterly)
+    2  1   2
+
+    >>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'], i='id',
+    ...                 j='year', sep='-')
+    ... # doctest: +NORMALIZE_WHITESPACE
+             X  A(quarterly)  B(quarterly)
     id year
-    0  2010  0       0.531828       0.322959
-    1  2010  2       0.634401       0.361789
-    2  2010  2       0.849432       0.228263
-    0  2011  0       0.724455       0.293714
-    1  2011  2       0.611024       0.630976
-    2  2011  2       0.722443       0.092105
+    0  2010  0      0.548814     0.437587
+    1  2010  1      0.715189     0.891773
+    2  2010  1      0.602763     0.963663
+    0  2011  0      0.544883     0.383442
+    1  2011  1      0.423655     0.791725
+    2  2011  1      0.645894     0.528895
 
     If we have many columns, we could also use a regex to find our
     stubnames and pass that list on to ``wide_to_long``.
 
-    >>> stubnames = set([match[0] for match in
-                        df.columns.str.findall('[A-B]\(.*\)').values
-                        if match != [] ])
+    >>> stubnames = sorted(
+    ...     set([match[0] for match in df.columns.str.findall(
+    ...         r'[A-B]\(.*\)').values if match != [] ])
+    ... )
     >>> list(stubnames)
-    ['B(quarterly)', 'A(quarterly)']
+    ['A(quarterly)', 'B(quarterly)']
 
     Notes
     -----
@@ -1133,7 +1138,7 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
     2  0  0    1
 
     >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
-                        'C': [1, 2, 3]})
+    ...                    'C': [1, 2, 3]})
 
     >>> pd.get_dummies(df, prefix=['col1', 'col2'])
        C  col1_a  col1_b  col2_a  col2_b  col2_c
@@ -1149,7 +1154,7 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
     3  1  0  0
     4  1  0  0
 
-    >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True))
+    >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
        b  c
     0  0  0
     1  1  0

diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
@@ -75,18 +75,18 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
     Examples
     --------
     >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True)
-    ([(0.191, 3.367], (0.191, 3.367], (0.191, 3.367], (3.367, 6.533],
-      (6.533, 9.7], (0.191, 3.367]]
-    Categories (3, object): [(0.191, 3.367] < (3.367, 6.533] < (6.533, 9.7]],
-    array([ 0.1905    ,  3.36666667,  6.53333333,  9.7       ]))
+    ... # doctest: +ELLIPSIS
+    ([(0.19, 3.367], (0.19, 3.367], (0.19, 3.367], (3.367, 6.533], ...
+    Categories (3, interval[float64]): [(0.19, 3.367] < (3.367, 6.533] ...
 
-    >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3,
-               labels=["good","medium","bad"])
+    >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]),
+    ...        3, labels=["good", "medium", "bad"])
+    ... # doctest: +SKIP
     [good, good, good, medium, bad, good]
     Categories (3, object): [good < medium < bad]
 
     >>> pd.cut(np.ones(5), 4, labels=False)
-    array([1, 1, 1, 1, 1], dtype=int64)
+    array([1, 1, 1, 1, 1])
     """
     # NOTE: this binning code is changed a bit from histogram for var(x) == 0
 
@@ -182,15 +182,17 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'):
     Examples
     --------
     >>> pd.qcut(range(5), 4)
-    [[0, 1], [0, 1], (1, 2], (2, 3], (3, 4]]
-    Categories (4, object): [[0, 1] < (1, 2] < (2, 3] < (3, 4]]
+    ... # doctest: +ELLIPSIS
+    [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
+    Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] ...
 
-    >>> pd.qcut(range(5), 3, labels=["good","medium","bad"])
+    >>> pd.qcut(range(5), 3, labels=["good", "medium", "bad"])
+    ... # doctest: +SKIP
     [good, good, medium, bad, bad]
     Categories (3, object): [good < medium < bad]
 
     >>> pd.qcut(range(5), 4, labels=False)
-    array([0, 0, 1, 2, 3], dtype=int64)
+    array([0, 0, 1, 2, 3])
     """
     x_is_series, series_index, name, x = _preprocess_for_cut(x)