diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 26917b8f9b792e..a038304fe0f7ae 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -59,6 +59,15 @@ if [ "$DOC" ]; then git remote -v git push origin gh-pages -f + + echo "Running doctests" + cd "$TRAVIS_BUILD_DIR" + pytest --doctest-modules \ + pandas/core/reshape/concat.py \ + pandas/core/reshape/pivot.py \ + pandas/core/reshape/reshape.py \ + pandas/core/reshape/tile.py + fi exit 0 diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index af2eb734a02f6d..96603b6adc3b0d 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -197,6 +197,8 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, 0 a 2 >>> pd.concat([df5, df6], verify_integrity=True) + Traceback (most recent call last): + ... ValueError: Indexes have overlapping values: ['a'] """ op = _Concatenator(objs, axis=axis, join_axes=join_axes, diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index b562f8a32f5c9d..0581ec7484c491 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -50,26 +50,36 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', Examples -------- + >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo", + ... "bar", "bar", "bar", "bar"], + ... "B": ["one", "one", "one", "two", "two", + ... "one", "one", "two", "two"], + ... "C": ["small", "large", "large", "small", + ... "small", "large", "small", "small", + ... "large"], + ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7]}) >>> df - A B C D - 0 foo one small 1 - 1 foo one large 2 - 2 foo one large 2 - 3 foo two small 3 - 4 foo two small 3 - 5 bar one large 4 - 6 bar one small 5 - 7 bar two small 6 - 8 bar two large 7 + A B C D + 0 foo one small 1 + 1 foo one large 2 + 2 foo one large 2 + 3 foo two small 3 + 4 foo two small 3 + 5 bar one large 4 + 6 bar one small 5 + 7 bar two small 6 + 8 bar two large 7 >>> table = pivot_table(df, values='D', index=['A', 'B'], ... columns=['C'], aggfunc=np.sum) >>> table - small large - foo one 1 4 - two 6 NaN - bar one 5 4 - two 6 7 + ... # doctest: +NORMALIZE_WHITESPACE + C large small + A B + bar one 4.0 5.0 + two 7.0 6.0 + foo one 4.0 1.0 + two NaN 6.0 Returns ------- @@ -445,27 +455,27 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, Examples -------- - >>> a - array([foo, foo, foo, foo, bar, bar, - bar, bar, foo, foo, foo], dtype=object) - >>> b - array([one, one, one, two, one, one, - one, two, two, two, one], dtype=object) - >>> c - array([dull, dull, shiny, dull, dull, shiny, - shiny, dull, shiny, shiny, shiny], dtype=object) - - >>> crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) - b one two - c dull shiny dull shiny + >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar", + ... "bar", "bar", "foo", "foo", "foo"], dtype=object) + >>> b = np.array(["one", "one", "one", "two", "one", "one", + ... "one", "two", "two", "two", "one"], dtype=object) + >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny", + ... "shiny", "dull", "shiny", "shiny", "shiny"], + ... dtype=object) + + >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) + ... # doctest: +NORMALIZE_WHITESPACE + b one two + c dull shiny dull shiny a - bar 1 2 1 0 - foo 2 2 1 2 + bar 1 2 1 0 + foo 2 2 1 2 >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']) >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f']) >>> crosstab(foo, bar) # 'c' and 'f' are not represented in the data, - # but they still will be counted in the output + ... # but they still will be counted in the output + ... # doctest: +SKIP col_0 d e f row_0 a 1 0 0 diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index f944dfe22361a1..dcb83d225699d5 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -48,23 +48,23 @@ class _Unstacker(object): >>> import pandas as pd >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), ... ('two', 'a'), ('two', 'b')]) - >>> s = pd.Series(np.arange(1.0, 5.0), index=index) + >>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index) >>> s - one a 1 - b 2 - two a 3 - b 4 - dtype: float64 + one a 1 + b 2 + two a 3 + b 4 + dtype: int64 >>> s.unstack(level=-1) - a b + a b one 1 2 two 3 4 >>> s.unstack(level=0) one two - a 1 2 - b 3 4 + a 1 3 + b 2 4 Returns ------- @@ -789,18 +789,18 @@ def lreshape(data, groups, dropna=True, label=None): >>> import pandas as pd >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526], ... 'team': ['Red Sox', 'Yankees'], - ... 'year1': [2007, 2008], 'year2': [2008, 2008]}) + ... 'year1': [2007, 2007], 'year2': [2008, 2008]}) >>> data hr1 hr2 team year1 year2 0 514 545 Red Sox 2007 2008 1 573 526 Yankees 2007 2008 >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']}) - team hr year - 0 Red Sox 514 2007 - 1 Yankees 573 2007 - 2 Red Sox 545 2008 - 3 Yankees 526 2008 + team year hr + 0 Red Sox 2007 514 + 1 Yankees 2007 573 + 2 Red Sox 2008 545 + 3 Yankees 2008 526 Returns ------- @@ -905,11 +905,12 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'): ... }) >>> df["id"] = df.index >>> df - A1970 A1980 B1970 B1980 X id + A1970 A1980 B1970 B1980 X id 0 a d 2.5 3.2 -1.085631 0 1 b e 1.2 1.3 0.997345 1 2 c f 0.7 0.1 0.282978 2 >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year") + ... # doctest: +NORMALIZE_WHITESPACE X A B id year 0 1970 -1.085631 a 2.5 @@ -940,6 +941,7 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'): 8 3 3 2.1 2.9 >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age') >>> l + ... # doctest: +NORMALIZE_WHITESPACE ht famid birth age 1 1 1 2.8 @@ -979,41 +981,44 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'): Less wieldy column names are also handled + >>> np.random.seed(0) >>> df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3), ... 'A(quarterly)-2011': np.random.rand(3), ... 'B(quarterly)-2010': np.random.rand(3), ... 'B(quarterly)-2011': np.random.rand(3), ... 'X' : np.random.randint(3, size=3)}) >>> df['id'] = df.index - >>> df - A(quarterly)-2010 A(quarterly)-2011 B(quarterly)-2010 B(quarterly)-2011 - 0 0.531828 0.724455 0.322959 0.293714 - 1 0.634401 0.611024 0.361789 0.630976 - 2 0.849432 0.722443 0.228263 0.092105 - \ + >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS + A(quarterly)-2010 A(quarterly)-2011 B(quarterly)-2010 ... + 0 0.548814 0.544883 0.437587 ... + 1 0.715189 0.423655 0.891773 ... + 2 0.602763 0.645894 0.963663 ... X id 0 0 0 1 1 1 - 2 2 2 - >>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'], - i='id', j='year', sep='-') - X A(quarterly) B(quarterly) + 2 1 2 + + >>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'], i='id', + ... j='year', sep='-') + ... # doctest: +NORMALIZE_WHITESPACE + X A(quarterly) B(quarterly) id year - 0 2010 0 0.531828 0.322959 - 1 2010 2 0.634401 0.361789 - 2 2010 2 0.849432 0.228263 - 0 2011 0 0.724455 0.293714 - 1 2011 2 0.611024 0.630976 - 2 2011 2 0.722443 0.092105 + 0 2010 0 0.548814 0.437587 + 1 2010 1 0.715189 0.891773 + 2 2010 1 0.602763 0.963663 + 0 2011 0 0.544883 0.383442 + 1 2011 1 0.423655 0.791725 + 2 2011 1 0.645894 0.528895 If we have many columns, we could also use a regex to find our stubnames and pass that list on to wide_to_long - >>> stubnames = set([match[0] for match in - df.columns.str.findall('[A-B]\(.*\)').values - if match != [] ]) + >>> stubnames = sorted( + ... set([match[0] for match in df.columns.str.findall( + ... r'[A-B]\(.*\)').values if match != [] ]) + ... ) >>> list(stubnames) - ['B(quarterly)', 'A(quarterly)'] + ['A(quarterly)', 'B(quarterly)'] Notes ----- @@ -1133,7 +1138,7 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, 2 0 0 1 >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], - 'C': [1, 2, 3]}) + ... 'C': [1, 2, 3]}) >>> pd.get_dummies(df, prefix=['col1', 'col2']) C col1_a col1_b col2_a col2_b col2_c @@ -1149,7 +1154,7 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, 3 1 0 0 4 1 0 0 - >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)) + >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True) b c 0 0 0 1 1 0 diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 746742f47f2aa9..866f229bec4189 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -75,18 +75,18 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, Examples -------- >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True) - ([(0.191, 3.367], (0.191, 3.367], (0.191, 3.367], (3.367, 6.533], - (6.533, 9.7], (0.191, 3.367]] - Categories (3, object): [(0.191, 3.367] < (3.367, 6.533] < (6.533, 9.7]], - array([ 0.1905 , 3.36666667, 6.53333333, 9.7 ])) + ... # doctest: +ELLIPSIS + ([(0.19, 3.367], (0.19, 3.367], (0.19, 3.367], (3.367, 6.533], ... + Categories (3, interval[float64]): [(0.19, 3.367] < (3.367, 6.533] ... - >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, - labels=["good","medium","bad"]) + >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), + ... 3, labels=["good", "medium", "bad"]) + ... # doctest: +SKIP [good, good, good, medium, bad, good] Categories (3, object): [good < medium < bad] >>> pd.cut(np.ones(5), 4, labels=False) - array([1, 1, 1, 1, 1], dtype=int64) + array([1, 1, 1, 1, 1]) """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 @@ -182,15 +182,17 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): Examples -------- >>> pd.qcut(range(5), 4) - [[0, 1], [0, 1], (1, 2], (2, 3], (3, 4]] - Categories (4, object): [[0, 1] < (1, 2] < (2, 3] < (3, 4]] + ... # doctest: +ELLIPSIS + [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]] + Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] ... - >>> pd.qcut(range(5), 3, labels=["good","medium","bad"]) + >>> pd.qcut(range(5), 3, labels=["good", "medium", "bad"]) + ... # doctest: +SKIP [good, good, medium, bad, bad] Categories (3, object): [good < medium < bad] >>> pd.qcut(range(5), 4, labels=False) - array([0, 0, 1, 2, 3], dtype=int64) + array([0, 0, 1, 2, 3]) """ x_is_series, series_index, name, x = _preprocess_for_cut(x)