Merge remote-tracking branch 'upstream/master' into Rt05

* upstream/master: DOC: CategoricalIndex doc string (pandas-dev#24852) CI: add __init__.py to isort skip list (pandas-dev#25455) TST: numpy RuntimeWarning with Series.round() (pandas-dev#25432) DOC: fixed geo accessor example in extending.rst (pandas-dev#25420) BUG: fixed merging with empty frame containing an Int64 column (pandas-dev#25183) (pandas-dev#25289) TST: remove never-used singleton fixtures (pandas-dev#24885) PERF/REF: improve performance of Series.searchsorted, PandasArray.searchsorted, collect functionality (pandas-dev#22034) BUG: Indexing with UTC offset string no longer ignored (pandas-dev#25263) API/ERR: allow iterators in df.set_index & improve errors (pandas-dev#24984) DOC: Rewriting of ParserError doc + minor spacing (pandas-dev#25421) ENH: Add in sort keyword to DatetimeIndex.union (pandas-dev#25110) ERR: doc update for ParsingError (pandas-dev#25414) BUG: Fix type coercion in read_json orient='table' (pandas-dev#21345) (pandas-dev#25219) DEP: add pytest-mock to environment.yml (pandas-dev#25417) Correct a typo of version number for interpolate() (pandas-dev#25418) Mark test_pct_max_many_rows as high memory (pandas-dev#25400) DOC: Edited docstring of Interval (pandas-dev#25410)
thoo · Feb 28, 2019 · 2d62018 · 2d62018
2 parents b847e4e + c986386
commit 2d62018
Show file tree

Hide file tree

Showing 53 changed files with 765 additions and 216 deletions.
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
@@ -124,6 +124,25 @@ def time_dropna(self, dtype):
         self.s.dropna()
 
 
+class SearchSorted(object):
+
+    goal_time = 0.2
+    params = ['int8', 'int16', 'int32', 'int64',
+              'uint8', 'uint16', 'uint32', 'uint64',
+              'float16', 'float32', 'float64',
+              'str']
+    param_names = ['dtype']
+
+    def setup(self, dtype):
+        N = 10**5
+        data = np.array([1] * N + [2] * N + [3] * N).astype(dtype)
+        self.s = Series(data)
+
+    def time_searchsorted(self, dtype):
+        key = '2' if dtype == 'str' else 2
+        self.s.searchsorted(key)
+
+
 class Map(object):
 
     params = ['dict', 'Series']

diff --git a/ci/deps/azure-27-compat.yaml b/ci/deps/azure-27-compat.yaml
@@ -21,6 +21,7 @@ dependencies:
   - pytest
   - pytest-xdist
   - pytest-mock
+  - isort
   - pip:
     - html5lib==1.0b2
     - beautifulsoup4==4.2.1

diff --git a/ci/deps/azure-27-locale.yaml b/ci/deps/azure-27-locale.yaml
@@ -24,6 +24,7 @@ dependencies:
   - pytest-xdist
   - pytest-mock
   - hypothesis>=3.58.0
+  - isort
   - pip:
     - html5lib==1.0b2
     - beautifulsoup4==4.2.1
diff --git a/ci/deps/azure-36-locale_slow.yaml b/ci/deps/azure-36-locale_slow.yaml
@@ -30,5 +30,6 @@ dependencies:
   - pytest-xdist
   - pytest-mock
   - moto
+  - isort
   - pip:
     - hypothesis>=3.58.0
diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml
@@ -28,6 +28,7 @@ dependencies:
   - pytest
   - pytest-xdist
   - pytest-mock
+  - isort
   - pip:
     - hypothesis>=3.58.0
     - moto  # latest moto in conda-forge fails with 3.7, move to conda dependencies when this is fixed
diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml
@@ -10,6 +10,7 @@ dependencies:
   - pytest-xdist
   - pytest-mock
   - hypothesis>=3.58.0
+  - isort
   - pip:
     - "git+git://github.com/dateutil/dateutil.git"
     - "-f https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com"

diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml
@@ -25,6 +25,7 @@ dependencies:
   - pytest
   - pytest-xdist
   - pytest-mock
+  - isort
   - pip:
     - python-dateutil==2.5.3
     - hypothesis>=3.58.0
diff --git a/ci/deps/azure-windows-27.yaml b/ci/deps/azure-windows-27.yaml
@@ -30,3 +30,4 @@ dependencies:
   - pytest-mock
   - moto
   - hypothesis>=3.58.0
+  - isort
diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml
@@ -27,3 +27,4 @@ dependencies:
   - pytest-xdist
   - pytest-mock
   - hypothesis>=3.58.0
+  - isort
diff --git a/ci/deps/travis-27.yaml b/ci/deps/travis-27.yaml
@@ -44,6 +44,7 @@ dependencies:
   - pytest-mock
   - moto==1.3.4
   - hypothesis>=3.58.0
+  - isort
   - pip:
     - backports.lzma
     - pandas-gbq

diff --git a/ci/deps/travis-36-doc.yaml b/ci/deps/travis-36-doc.yaml
@@ -43,3 +43,4 @@ dependencies:
   # universal
   - pytest
   - pytest-xdist
+  - isort
diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml
@@ -32,5 +32,6 @@ dependencies:
   - pytest-xdist
   - pytest-mock
   - moto
+  - isort
   - pip:
     - hypothesis>=3.58.0
diff --git a/ci/deps/travis-36-slow.yaml b/ci/deps/travis-36-slow.yaml
@@ -30,3 +30,4 @@ dependencies:
   - pytest-mock
   - moto
   - hypothesis>=3.58.0
+  - isort
diff --git a/ci/deps/travis-36.yaml b/ci/deps/travis-36.yaml
@@ -38,6 +38,7 @@ dependencies:
   - pytest-cov
   - pytest-mock
   - hypothesis>=3.58.0
+  - isort
   - pip:
     - brotlipy
     - coverage

diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml
@@ -17,5 +17,6 @@ dependencies:
   - pytest-mock
   - hypothesis>=3.58.0
   - s3fs
+  - isort
   - pip:
     - moto
diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst
@@ -33,8 +33,9 @@ decorate a class, providing the name of attribute to add. The class's
 
        @staticmethod
        def _validate(obj):
-           if 'lat' not in obj.columns or 'lon' not in obj.columns:
-               raise AttributeError("Must have 'lat' and 'lon'.")
+           # verify there is a column latitude and a column longitude
+           if 'latitude' not in obj.columns or 'longitude' not in obj.columns:
+               raise AttributeError("Must have 'latitude' and 'longitude'.")
 
        @property
        def center(self):

diff --git a/doc/source/styled.xlsx b/doc/source/styled.xlsx
diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst
@@ -335,7 +335,7 @@ examined :ref:`in the API <api.dataframe.missing>`.
 Interpolation
 ~~~~~~~~~~~~~
 
-.. versionadded:: 0.21.0
+.. versionadded:: 0.23.0
 
   The ``limit_area`` keyword argument was added.
 

diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst
@@ -633,6 +633,16 @@ We are stopping on the included end-point as it is part of the index:
    dft2 = dft2.swaplevel(0, 1).sort_index()
    dft2.loc[idx[:, '2013-01-05'], :]
 
+.. versionadded:: 0.25.0
+
+Slicing with string indexing also honors UTC offset.
+
+.. ipython:: python
+
+    df = pd.DataFrame([0], index=pd.DatetimeIndex(['2019-01-01'], tz='US/Pacific'))
+    df
+    df['2019-01-01 12:00:00+04:00':'2019-01-01 13:00:00+04:00']
+
 .. _timeseries.slice_vs_exact_match:
 
 Slice vs. Exact Match

diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst
@@ -96,7 +96,7 @@ Bug Fixes
 **Other**
 
 - Bug in :meth:`Series.is_unique` where single occurrences of ``NaN`` were not considered unique (:issue:`25180`)
--
+- Bug in :func:`merge` when merging an empty ``DataFrame`` with an ``Int64`` column or a non-empty ``DataFrame`` with an ``Int64`` column that is all ``NaN`` (:issue:`25183`)
 -
 
 .. _whatsnew_0.242.contributors:

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -22,14 +22,46 @@ Other Enhancements
 - Indexing of ``DataFrame`` and ``Series`` now accepts zerodim ``np.ndarray`` (:issue:`24919`)
 - :meth:`Timestamp.replace` now supports the ``fold`` argument to disambiguate DST transition times (:issue:`25017`)
 - :meth:`DataFrame.at_time` and :meth:`Series.at_time` now support :meth:`datetime.time` objects with timezones (:issue:`24043`)
+- :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`)
+- :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`)
 -
 
 .. _whatsnew_0250.api_breaking:
 
 Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-- :meth:`Timestamp.strptime` will now raise a NotImplementedError (:issue:`25016`)
+.. _whatsnew_0250.api_breaking.utc_offset_indexing:
+
+Indexing with date strings with UTC offsets
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Indexing a :class:`DataFrame` or :class:`Series` with a :class:`DatetimeIndex` with a
+date string with a UTC offset would previously ignore the UTC offset. Now, the UTC offset
+is respected in indexing. (:issue:`24076`, :issue:`16785`)
+
+*Previous Behavior*:
+
+.. code-block:: ipython
+
+    In [1]: df = pd.DataFrame([0], index=pd.DatetimeIndex(['2019-01-01'], tz='US/Pacific'))
+
+    In [2]: df
+    Out[2]:
+                               0
+    2019-01-01 00:00:00-08:00  0
+
+    In [3]: df['2019-01-01 00:00:00+04:00':'2019-01-01 01:00:00+04:00']
+    Out[3]:
+                               0
+    2019-01-01 00:00:00-08:00  0
+
+*New Behavior*:
+
+.. ipython:: python
+
+    df = pd.DataFrame([0], index=pd.DatetimeIndex(['2019-01-01'], tz='US/Pacific'))
+    df['2019-01-01 12:00:00+04:00':'2019-01-01 13:00:00+04:00']
 
 .. _whatsnew_0250.api.other:
 
@@ -38,7 +70,7 @@ Other API Changes
 
 - :class:`DatetimeTZDtype` will now standardize pytz timezones to a common timezone instance (:issue:`24713`)
 - ``Timestamp`` and ``Timedelta`` scalars now implement the :meth:`to_numpy` method as aliases to :meth:`Timestamp.to_datetime64` and :meth:`Timedelta.to_timedelta64`, respectively. (:issue:`24653`)
--
+- :meth:`Timestamp.strptime` will now raise a ``NotImplementedError`` (:issue:`25016`)
 -
 
 .. _whatsnew_0250.deprecations:
@@ -64,7 +96,8 @@ Performance Improvements
 
 - Significant speedup in `SparseArray` initialization that benefits most operations, fixing performance regression introduced in v0.20.0 (:issue:`24985`)
 - `DataFrame.to_stata()` is now faster when outputting data with any string or non-native endian columns (:issue:`25045`)
--
+- Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is
+  int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`)
 
 
 .. _whatsnew_0250.bug_fixes:
@@ -160,6 +193,7 @@ I/O
 ^^^
 
 - Fixed bug in missing text when using :meth:`to_clipboard` if copying utf-16 characters in Python 3 on Windows (:issue:`25040`)
+- Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`)
 -
 -
 -

diff --git a/environment.yml b/environment.yml
@@ -20,6 +20,7 @@ dependencies:
   - isort
   - moto
   - pytest>=4.0
+  - pytest-mock
   - sphinx
   - numpydoc
 

diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx
@@ -150,9 +150,6 @@ cdef class Interval(IntervalMixin):
         Left bound for the interval.
     right : orderable scalar
         Right bound for the interval.
-    closed : {'left', 'right', 'both', 'neither'}, default 'right'
-        Whether the interval is closed on the left-side, right-side, both or
-        neither.
     closed : {'right', 'left', 'both', 'neither'}, default 'right'
         Whether the interval is closed on the left-side, right-side, both or
         neither. See the Notes for more detailed explanation.

diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
@@ -137,6 +137,7 @@ def lfilter(*args, **kwargs):
     reload = reload
     Hashable = collections.abc.Hashable
     Iterable = collections.abc.Iterable
+    Iterator = collections.abc.Iterator
     Mapping = collections.abc.Mapping
     MutableMapping = collections.abc.MutableMapping
     Sequence = collections.abc.Sequence
@@ -199,6 +200,7 @@ def get_range_parameters(data):
 
     Hashable = collections.Hashable
     Iterable = collections.Iterable
+    Iterator = collections.Iterator
     Mapping = collections.Mapping
     MutableMapping = collections.MutableMapping
     Sequence = collections.Sequence

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -19,7 +19,7 @@
     ensure_float64, ensure_int64, ensure_object, ensure_platform_int,
     ensure_uint64, is_array_like, is_bool_dtype, is_categorical_dtype,
     is_complex_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype,
-    is_datetimelike, is_extension_array_dtype, is_float_dtype,
+    is_datetimelike, is_extension_array_dtype, is_float_dtype, is_integer,
     is_integer_dtype, is_interval_dtype, is_list_like, is_numeric_dtype,
     is_object_dtype, is_period_dtype, is_scalar, is_signed_integer_dtype,
     is_sparse, is_timedelta64_dtype, is_unsigned_integer_dtype,
@@ -1729,6 +1729,89 @@ def func(arr, indexer, out, fill_value=np.nan):
     return out
 
 
+# ------------ #
+# searchsorted #
+# ------------ #
+
+def searchsorted(arr, value, side="left", sorter=None):
+    """
+    Find indices where elements should be inserted to maintain order.
+
+    .. versionadded:: 0.25.0
+
+    Find the indices into a sorted array `arr` such that, if the
+    corresponding elements in `value` were inserted before the indices,
+    the order of `arr` would be preserved.
+
+    Assuming that `arr` is sorted:
+
+    ======  ================================
+    `side`  returned index `i` satisfies
+    ======  ================================
+    left    ``arr[i-1] < value <= arr[i]``
+    right   ``arr[i-1] <= value < arr[i]``
+    ======  ================================
+
+    Parameters
+    ----------
+    arr : array-like
+        Input array. If `sorter` is None, then it must be sorted in
+        ascending order, otherwise `sorter` must be an array of indices
+        that sort it.
+    value : array_like
+        Values to insert into `arr`.
+    side : {'left', 'right'}, optional
+        If 'left', the index of the first suitable location found is given.
+        If 'right', return the last such index.  If there is no suitable
+        index, return either 0 or N (where N is the length of `arr`).
+    sorter : 1-D array_like, optional
+        Optional array of integer indices that sort `arr` into ascending
+        order. They are typically the result of argsort.
+
+    Returns
+    -------
+    array of ints
+        Array of insertion points with the same shape as `value`.
+
+    See Also
+    --------
+    numpy.searchsorted : Similar method from NumPy.
+    """
+    if sorter is not None:
+        sorter = ensure_platform_int(sorter)
+
+    if isinstance(arr, np.ndarray) and is_integer_dtype(arr) and (
+            is_integer(value) or is_integer_dtype(value)):
+        from .arrays.array_ import array
+        # if `arr` and `value` have different dtypes, `arr` would be
+        # recast by numpy, causing a slow search.
+        # Before searching below, we therefore try to give `value` the
+        # same dtype as `arr`, while guarding against integer overflows.
+        iinfo = np.iinfo(arr.dtype.type)
+        value_arr = np.array([value]) if is_scalar(value) else np.array(value)
+        if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all():
+            # value within bounds, so no overflow, so can convert value dtype
+            # to dtype of arr
+            dtype = arr.dtype
+        else:
+            dtype = value_arr.dtype
+
+        if is_scalar(value):
+            value = dtype.type(value)
+        else:
+            value = array(value, dtype=dtype)
+    elif not (is_object_dtype(arr) or is_numeric_dtype(arr) or
+              is_categorical_dtype(arr)):
+        from pandas.core.series import Series
+        # E.g. if `arr` is an array with dtype='datetime64[ns]'
+        # and `value` is a pd.Timestamp, we may need to convert value
+        value_ser = Series(value)._values
+        value = value_ser[0] if is_scalar(value) else value_ser
+
+    result = arr.searchsorted(value, side=side, sorter=sorter)
+    return result
+
+
 # ---- #
 # diff #
 # ---- #