Merge remote-tracking branch 'upstream/master' into Rt05
* upstream/master:
  BUG: Fix exceptions when Series.interpolate's `order` parameter is missing or invalid (pandas-dev#25246)
  API: Ensure DatetimeTZDtype standardizes pytz timezones (pandas-dev#25254)
  Split Excel IO Into Sub-Directory (pandas-dev#25153)
  PR04 errors fix (pandas-dev#25157)
  DEPR: remove assert_panel_equal (pandas-dev#25238)
  BUG: pandas Timestamp tz_localize and tz_convert do not preserve `freq` attribute (pandas-dev#25247)
  Revert "BLD: prevent asv from calling sys.stdin.close() by using different launch method (pandas-dev#25237)" (pandas-dev#25253)
  REF/TST: resample/test_base.py (pandas-dev#25262)
  BUG: Duplicated returns boolean dataframe (pandas-dev#25234)
  CLN: Remove ipython 2.x compat (pandas-dev#25150)
  Refactor groupby group_add from tempita to fused types (pandas-dev#24954)
  CLN: For loops, boolean conditions, misc. (pandas-dev#25206)
  (Closes pandas-dev#25029) Removed extra bracket from cheatsheet code example. (pandas-dev#25032)
  BLD: prevent asv from calling sys.stdin.close() by using different launch method (pandas-dev#25237)
  BUG: Fix read_json orient='table' without index (pandas-dev#25170) (pandas-dev#25171)
  BUG: Fix regression in DataFrame.apply causing RecursionError (pandas-dev#25230)
  BUG-25061 fix printing indices with NaNs (pandas-dev#25202)
  DEPR: Add Deprecated warning for timedelta with passed units M and Y  (pandas-dev#23264)
  DEPR: Remove Panel-specific parts of io.pytables (pandas-dev#25233)
  DEPR: remove tm.makePanel and all usages (pandas-dev#25231)
thoo committed Feb 11, 2019
2 parents 25e7503 + ea1d5f5 commit 1824b99
Showing 91 changed files with 2,536 additions and 4,887 deletions.
1 change: 1 addition & 0 deletions asv_bench/benchmarks/__init__.py
@@ -0,0 +1 @@
"""Pandas benchmarks."""
4 changes: 2 additions & 2 deletions ci/code_checks.sh
@@ -241,8 +241,8 @@ fi
### DOCSTRINGS ###
if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then

MSG='Validate docstrings (GL06, GL07, GL09, SS04, SS05, PR03, PR05, EX04, RT04, RT05, SA05)' ; echo $MSG
$BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL06,GL07,GL09,SS04,SS05,PR03,PR05,EX04,RT04,RT05,SA05
MSG='Validate docstrings (GL06, GL07, GL09, SS04, SS05, PR03, PR04, PR05, EX04, RT04, RT05, SA05)' ; echo $MSG
$BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL06,GL07,GL09,SS04,SS05,PR03,PR04,PR05,EX04,RT04,RT05,SA05
RET=$(($RET + $?)) ; echo $MSG "DONE"

fi
Binary file modified doc/cheatsheet/Pandas_Cheat_Sheet.pdf
Binary file not shown.
Binary file modified doc/cheatsheet/Pandas_Cheat_Sheet.pptx
Binary file not shown.
Binary file modified doc/cheatsheet/Pandas_Cheat_Sheet_JA.pdf
Binary file not shown.
Binary file modified doc/cheatsheet/Pandas_Cheat_Sheet_JA.pptx
Binary file not shown.
24 changes: 24 additions & 0 deletions doc/source/user_guide/timeseries.rst
@@ -321,6 +321,15 @@ which can be specified. These are computed from the starting point specified by
    pd.to_datetime([1349720105100, 1349720105200, 1349720105300,
                    1349720105400, 1349720105500], unit='ms')
Constructing a :class:`Timestamp` or :class:`DatetimeIndex` with an epoch timestamp
with the ``tz`` argument specified will localize the epoch timestamps to UTC
first then convert the result to the specified time zone.

.. ipython:: python

    pd.Timestamp(1262347200000000000, tz='US/Pacific')
    pd.DatetimeIndex([1262347200000000000], tz='US/Pacific')

.. note::

    Epoch times will be rounded to the nearest nanosecond.
@@ -2205,6 +2214,21 @@ you can use the ``tz_convert`` method.
rng_pytz.tz_convert('US/Eastern')
.. note::

    When using ``pytz`` time zones, :class:`DatetimeIndex` will construct a different
    time zone object than a :class:`Timestamp` for the same time zone input. A :class:`DatetimeIndex`
    can hold a collection of :class:`Timestamp` objects that may have different UTC offsets and cannot be
    succinctly represented by one ``pytz`` time zone instance while one :class:`Timestamp`
    represents one point in time with a specific UTC offset.

.. ipython:: python

    dti = pd.date_range('2019-01-01', periods=3, freq='D', tz='US/Pacific')
    dti.tz
    ts = pd.Timestamp('2019-01-01', tz='US/Pacific')
    ts.tz

.. warning::

    Be wary of conversions between libraries. For some time zones, ``pytz`` and ``dateutil`` have different
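The new documentation above can be spot-checked: constructing in one step with ``tz=`` should match an explicit localize-to-UTC-then-convert round trip. A minimal sketch using the epoch value from the example (assumes pandas is installed):

```python
import pandas as pd

# The docs state that an epoch passed together with tz= is interpreted as
# UTC first and then converted to the requested zone. Constructing in one
# step should therefore equal the explicit two-step round trip.
direct = pd.Timestamp(1262347200000000000, tz="US/Pacific")
two_step = pd.Timestamp(1262347200000000000, tz="UTC").tz_convert("US/Pacific")

assert direct == two_step
# .value is always the UTC epoch in nanoseconds, unchanged by the tz
assert direct.value == 1262347200000000000
```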
7 changes: 5 additions & 2 deletions doc/source/whatsnew/v0.24.2.rst
@@ -21,8 +21,10 @@ Fixed Regressions
^^^^^^^^^^^^^^^^^

- Fixed regression in :meth:`DataFrame.all` and :meth:`DataFrame.any` where ``bool_only=True`` was ignored (:issue:`25101`)

- Fixed issue in ``DataFrame`` construction where passing a mixed list of mixed types could segfault (:issue:`25075`)
- Fixed regression in :meth:`DataFrame.apply` causing ``RecursionError`` when ``dict``-like classes were passed as argument. (:issue:`25196`)

- Fixed regression in :meth:`DataFrame.duplicated()`, where an empty ``DataFrame`` was not returning a boolean-dtyped ``Series`` (:issue:`25184`)

.. _whatsnew_0242.enhancements:

@@ -52,7 +54,8 @@ Bug Fixes
**I/O**

- Bug in reading a HDF5 table-format ``DataFrame`` created in Python 2, in Python 3 (:issue:`24925`)
-
- Bug in reading a JSON with ``orient='table'`` generated by :meth:`DataFrame.to_json` with ``index=False`` (:issue:`25170`)
- Bug where float indexes could have misaligned values when printing (:issue:`25061`)
-

**Categorical**
15 changes: 7 additions & 8 deletions doc/source/whatsnew/v0.25.0.rst
@@ -33,7 +33,7 @@ Backwards incompatible API changes
Other API Changes
^^^^^^^^^^^^^^^^^

-
- :class:`DatetimeTZDtype` will now standardize pytz timezones to a common timezone instance (:issue:`24713`)
-
-

@@ -42,16 +42,13 @@ Other API Changes
Deprecations
~~~~~~~~~~~~

-
-
-

- Deprecated the ``M (months)`` and ``Y (year)`` ``units`` parameter of :func:`pandas.to_timedelta`, :func:`pandas.Timedelta` and :func:`pandas.TimedeltaIndex` (:issue:`16344`)

.. _whatsnew_0250.prior_deprecations:

Removal of prior version deprecations/changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Removed (parts of) :class:`Panel` (:issue:`25047`)
- Removed (parts of) :class:`Panel` (:issue:`25047`, :issue:`25191`, :issue:`25231`)
-
-
-
@@ -71,6 +68,8 @@ Performance Improvements
Bug Fixes
~~~~~~~~~

-

Categorical
^^^^^^^^^^^

@@ -96,7 +95,7 @@ Timezones
^^^^^^^^^

- Bug in :func:`to_datetime` with ``utc=True`` and datetime strings that would apply previously parsed UTC offsets to subsequent arguments (:issue:`24992`)
-
- Bug in :func:`Timestamp.tz_localize` and :func:`Timestamp.tz_convert` where ``freq`` was not propagated (:issue:`25241`)
-

Numeric
@@ -142,7 +141,7 @@ Indexing
Missing
^^^^^^^

-
- Fixed misleading exception message in :meth:`Series.interpolate` if argument ``order`` is required, but omitted (:issue:`10633`, :issue:`24014`).
-
-

51 changes: 51 additions & 0 deletions pandas/_libs/groupby.pyx
@@ -2,6 +2,7 @@

import cython
from cython import Py_ssize_t
from cython cimport floating

from libc.stdlib cimport malloc, free

@@ -382,5 +383,55 @@ def group_any_all(uint8_t[:] out,
out[lab] = flag_val


@cython.wraparound(False)
@cython.boundscheck(False)
def _group_add(floating[:, :] out,
               int64_t[:] counts,
               floating[:, :] values,
               const int64_t[:] labels,
               Py_ssize_t min_count=0):
    """
    Only aggregates on axis=0
    """
    cdef:
        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
        floating val, count
        ndarray[floating, ndim=2] sumx, nobs

    if not len(values) == len(labels):
        raise AssertionError("len(index) != len(labels)")

    nobs = np.zeros_like(out)
    sumx = np.zeros_like(out)

    N, K = (<object>values).shape

    with nogil:

        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue

            counts[lab] += 1
            for j in range(K):
                val = values[i, j]

                # not nan
                if val == val:
                    nobs[lab, j] += 1
                    sumx[lab, j] += val

        for i in range(ncounts):
            for j in range(K):
                if nobs[i, j] < min_count:
                    out[i, j] = NAN
                else:
                    out[i, j] = sumx[i, j]


group_add_float32 = _group_add['float']
group_add_float64 = _group_add['double']

# generated from template
include "groupby_helper.pxi"
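As a cross-check of the fused-types kernel above, here is a rough pure-NumPy sketch of the same aggregation (the function name and return convention are mine, not pandas API): NaNs are skipped while summing, and any cell with fewer than ``min_count`` observations becomes NaN.

```python
import numpy as np

def group_add(values, labels, ngroups, min_count=0):
    """Pure-Python sketch of pandas' Cython _group_add: per-group column
    sums over axis 0, skipping NaNs; labels of -1 mark unassigned rows."""
    N, K = values.shape
    out = np.zeros((ngroups, K))
    nobs = np.zeros((ngroups, K))
    counts = np.zeros(ngroups, dtype=np.int64)
    for i in range(N):
        lab = labels[i]
        if lab < 0:          # row belongs to no group
            continue
        counts[lab] += 1
        for j in range(K):
            val = values[i, j]
            if val == val:   # NaN != NaN, so this skips missing values
                nobs[lab, j] += 1
                out[lab, j] += val
    # cells with too few observations are masked out, as in the kernel
    out[nobs < min_count] = np.nan
    return out, counts
```

With ``min_count=0`` (the default, matching the Cython signature) an all-NaN group simply sums to 0.0 rather than NaN.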
49 changes: 1 addition & 48 deletions pandas/_libs/groupby_helper.pxi.in
@@ -9,7 +9,7 @@ cdef extern from "numpy/npy_math.h":
_int64_max = np.iinfo(np.int64).max

# ----------------------------------------------------------------------
# group_add, group_prod, group_var, group_mean, group_ohlc
# group_prod, group_var, group_mean, group_ohlc
# ----------------------------------------------------------------------

{{py:
@@ -27,53 +27,6 @@ def get_dispatch(dtypes):
{{for name, c_type in get_dispatch(dtypes)}}


@cython.wraparound(False)
@cython.boundscheck(False)
def group_add_{{name}}({{c_type}}[:, :] out,
                       int64_t[:] counts,
                       {{c_type}}[:, :] values,
                       const int64_t[:] labels,
                       Py_ssize_t min_count=0):
    """
    Only aggregates on axis=0
    """
    cdef:
        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
        {{c_type}} val, count
        ndarray[{{c_type}}, ndim=2] sumx, nobs

    if not len(values) == len(labels):
        raise AssertionError("len(index) != len(labels)")

    nobs = np.zeros_like(out)
    sumx = np.zeros_like(out)

    N, K = (<object>values).shape

    with nogil:

        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue

            counts[lab] += 1
            for j in range(K):
                val = values[i, j]

                # not nan
                if val == val:
                    nobs[lab, j] += 1
                    sumx[lab, j] += val

        for i in range(ncounts):
            for j in range(K):
                if nobs[i, j] < min_count:
                    out[i, j] = NAN
                else:
                    out[i, j] = sumx[i, j]


@cython.wraparound(False)
@cython.boundscheck(False)
def group_prod_{{name}}({{c_type}}[:, :] out,
12 changes: 9 additions & 3 deletions pandas/_libs/tslibs/timedeltas.pyx
@@ -1127,10 +1127,11 @@ class Timedelta(_Timedelta):
'ms', 'milliseconds', 'millisecond', 'milli', 'millis', 'L',
'us', 'microseconds', 'microsecond', 'micro', 'micros', 'U',
'ns', 'nanoseconds', 'nano', 'nanos', 'nanosecond', 'N'}
days, seconds, microseconds,
milliseconds, minutes, hours, weeks : numeric, optional
**kwargs
Available kwargs: {days, seconds, microseconds,
milliseconds, minutes, hours, weeks}.
Values for construction in compat with datetime.timedelta.
np ints and floats will be coerced to python ints and floats.
Numpy ints and floats will be coerced to python ints and floats.
Notes
-----
@@ -1158,6 +1159,11 @@ class Timedelta(_Timedelta):
"[weeks, days, hours, minutes, seconds, "
"milliseconds, microseconds, nanoseconds]")

        if unit in {'Y', 'y', 'M'}:
            warnings.warn("M and Y units are deprecated and "
                          "will be removed in a future version.",
                          FutureWarning, stacklevel=1)

        if isinstance(value, Timedelta):
            value = value.value
        elif is_string_object(value):
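The new branch above makes ``M``/``Y`` units emit a ``FutureWarning``. A hedged, version-tolerant sketch of what callers observe (the helper name is mine, not pandas API):

```python
import warnings

import pandas as pd

def month_unit_rejected():
    # "M" (month) and "Y" (year) are not fixed-length spans, which is why
    # this commit deprecates them. Depending on the installed pandas this
    # surfaces as the FutureWarning added above (escalated to an error
    # here) or as a ValueError once the deprecation was completed.
    try:
        with warnings.catch_warnings():
            warnings.simplefilter("error", FutureWarning)
            pd.to_timedelta(1, unit="M")
    except (FutureWarning, ValueError):
        return True
    return False
```

Callers that want a calendar month should use a ``DateOffset`` (e.g. ``pd.DateOffset(months=1)``) instead of a ``Timedelta``.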
6 changes: 3 additions & 3 deletions pandas/_libs/tslibs/timestamps.pyx
@@ -1187,12 +1187,12 @@ class Timestamp(_Timestamp):
value = tz_localize_to_utc(np.array([self.value], dtype='i8'), tz,
ambiguous=ambiguous,
nonexistent=nonexistent)[0]
return Timestamp(value, tz=tz)
return Timestamp(value, tz=tz, freq=self.freq)
else:
if tz is None:
# reset tz
value = tz_convert_single(self.value, UTC, self.tz)
return Timestamp(value, tz=None)
return Timestamp(value, tz=tz, freq=self.freq)
else:
raise TypeError('Cannot localize tz-aware Timestamp, use '
'tz_convert for conversions')
@@ -1222,7 +1222,7 @@
'tz_localize to localize')
else:
# Same UTC timestamp, different time zone
return Timestamp(self.value, tz=tz)
return Timestamp(self.value, tz=tz, freq=self.freq)

astimezone = tz_convert

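The three changed ``return`` statements thread ``self.freq`` through so time-zone operations stop dropping frequency information. ``Timestamp.freq`` itself was removed in later pandas, so as a stable analogy the same invariant can be checked on a ``DatetimeIndex``:

```python
import pandas as pd

# tz conversion relabels the wall-clock times but not the underlying
# instants or their spacing, so the index's freq should survive it.
idx = pd.date_range("2019-01-01", periods=3, freq="D", tz="UTC")
converted = idx.tz_convert("US/Pacific")

assert converted.freq == idx.freq
assert (converted.asi8 == idx.asi8).all()  # same absolute instants
```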
2 changes: 1 addition & 1 deletion pandas/core/algorithms.py
@@ -566,7 +566,7 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None,
coerced to ndarrays before factorization.
"""),
order=dedent("""\
order
order : None
.. deprecated:: 0.23.0
This parameter has no effect and is deprecated.
3 changes: 1 addition & 2 deletions pandas/core/arrays/categorical.py
@@ -2167,8 +2167,7 @@ def _reverse_indexer(self):
r, counts = libalgos.groupsort_indexer(self.codes.astype('int64'),
categories.size)
counts = counts.cumsum()
result = [r[counts[indexer]:counts[indexer + 1]]
for indexer in range(len(counts) - 1)]
result = (r[start:end] for start, end in zip(counts, counts[1:]))
result = dict(zip(categories, result))
return result

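The ``_reverse_indexer`` change above swaps an index-arithmetic list comprehension for ``zip`` over consecutive cumulative counts (built lazily as a generator in the commit). A small sketch with made-up data showing the two slicings agree:

```python
import numpy as np

# Hypothetical groupsort output: positions ordered by category, plus the
# cumulative count boundaries (length = number of categories + 1).
r = np.array([0, 2, 4, 1, 3])
counts = np.array([0, 3, 5])

# Old form: explicit index arithmetic over adjacent boundary pairs.
old = [r[counts[i]:counts[i + 1]] for i in range(len(counts) - 1)]
# New form: zip pairs each boundary with its successor directly
# (a list here for comparison; the commit uses a generator).
new = [r[start:end] for start, end in zip(counts, counts[1:])]

assert all((a == b).all() for a, b in zip(old, new))
```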
2 changes: 1 addition & 1 deletion pandas/core/arrays/datetimes.py
@@ -128,7 +128,7 @@ def _dt_array_cmp(cls, op):
Wrap comparison operations to convert datetime-like to datetime64
"""
opname = '__{name}__'.format(name=op.__name__)
nat_result = True if opname == '__ne__' else False
nat_result = opname == '__ne__'

def wrapper(self, other):
if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
2 changes: 1 addition & 1 deletion pandas/core/arrays/integer.py
@@ -561,7 +561,7 @@ def cmp_method(self, other):
else:
mask = self._mask | mask

result[mask] = True if op_name == 'ne' else False
result[mask] = op_name == 'ne'
return result

name = '__{name}__'.format(name=op.__name__)
2 changes: 1 addition & 1 deletion pandas/core/arrays/period.py
@@ -46,7 +46,7 @@ def _period_array_cmp(cls, op):
Wrap comparison operations to convert Period-like to PeriodDtype
"""
opname = '__{name}__'.format(name=op.__name__)
nat_result = True if opname == '__ne__' else False
nat_result = opname == '__ne__'

def wrapper(self, other):
op = getattr(self.asi8, opname)
2 changes: 1 addition & 1 deletion pandas/core/arrays/timedeltas.py
@@ -62,7 +62,7 @@ def _td_array_cmp(cls, op):
Wrap comparison operations to convert timedelta-like to timedelta64
"""
opname = '__{name}__'.format(name=op.__name__)
nat_result = True if opname == '__ne__' else False
nat_result = opname == '__ne__'

def wrapper(self, other):
if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
6 changes: 3 additions & 3 deletions pandas/core/computation/pytables.py
@@ -252,7 +252,7 @@ def evaluate(self):
.format(slf=self))

rhs = self.conform(self.rhs)
values = [TermValue(v, v, self.kind) for v in rhs]
values = [TermValue(v, v, self.kind).value for v in rhs]

if self.is_in_table:

@@ -263,7 +263,7 @@ def evaluate(self):
self.filter = (
self.lhs,
filter_op,
pd.Index([v.value for v in values]))
pd.Index(values))

return self
return None
@@ -275,7 +275,7 @@ def evaluate(self):
self.filter = (
self.lhs,
filter_op,
pd.Index([v.value for v in values]))
pd.Index(values))

else:
raise TypeError("passing a filterable condition to a non-table "