Merge remote-tracking branch 'upstream/master' into bug/categorical-indexing-1row-df

* upstream/master: (333 commits) CI: troubleshoot Web_and_Docs failing (pandas-dev#30534) WARN: Ignore NumbaPerformanceWarning in test suite (pandas-dev#30525) DEPR: camelCase in offsets, get_offset (pandas-dev#30340) PERF: implement scalar ops blockwise (pandas-dev#29853) DEPR: Remove Series.compress (pandas-dev#30514) ENH: Add numba engine for rolling apply (pandas-dev#30151) [ENH] Add to_markdown method (pandas-dev#30350) DEPR: Deprecate pandas.np module (pandas-dev#30386) ENH: Add ignore_index for df.drop_duplicates (pandas-dev#30405) BUG: The setting xrot=0 in DataFrame.hist() doesn't work with by and subplots pandas-dev#30288 (pandas-dev#30491) CI: Fix GBQ Tests (pandas-dev#30478) Bug groupby quantile listlike q and int columns (pandas-dev#30485) ENH: Add ignore_index for df.sort_values and series.sort_values (pandas-dev#30402) TYP: Typing hints in pandas/io/formats/{css,csvs}.py (pandas-dev#30398) BUG: raise on non-hashable Index name, closes pandas-dev#29069 (pandas-dev#30335) Replace "foo!r" to "repr(foo)" syntax pandas-dev#29886 (pandas-dev#30502) BUG: preserve EA dtype in transpose (pandas-dev#30091) BLD: add check to prevent tempita name error, clsoes pandas-dev#28836 (pandas-dev#30498) REF/TST: method-specific files for test_append (pandas-dev#30503) marked unused parameters (pandas-dev#30504) ...
keechongtan · Dec 29, 2019 · c5a7f6e · c5a7f6e
2 parents 5512119 + e817fff
commit c5a7f6e
Show file tree

Hide file tree

Showing 595 changed files with 18,013 additions and 22,982 deletions.
diff --git a/.binstar.yml b/.binstar.yml
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -15,12 +15,12 @@ jobs:
     runs-on: ubuntu-latest
     steps:
 
+    - name: Setting conda path
+      run: echo "::add-path::${HOME}/miniconda3/bin"
+
     - name: Checkout
       uses: actions/checkout@v1
 
-    - name: Setting conda path
-      run: echo "::set-env name=PATH::${HOME}/miniconda3/bin:${PATH}"
-
     - name: Looking for unwanted patterns
       run: ci/code_checks.sh patterns
       if: true

diff --git a/.travis.yml b/.travis.yml
@@ -30,31 +30,34 @@ matrix:
       - python: 3.5
 
     include:
-    - dist: trusty
-      env:
+    - env:
         - JOB="3.8" ENV_FILE="ci/deps/travis-38.yaml" PATTERN="(not slow and not network)"
 
-    - dist: trusty
-      env:
+    - env:
         - JOB="3.7" ENV_FILE="ci/deps/travis-37.yaml" PATTERN="(not slow and not network)"
 
-    - dist: trusty
-      env:
-        - JOB="3.6, locale" ENV_FILE="ci/deps/travis-36-locale.yaml" PATTERN="((not slow and not network) or (single and db))" LOCALE_OVERRIDE="zh_CN.UTF-8"
+    - env:
+        - JOB="3.6, locale" ENV_FILE="ci/deps/travis-36-locale.yaml" PATTERN="((not slow and not network) or (single and db))" LOCALE_OVERRIDE="zh_CN.UTF-8" SQL="1"
+      services:
+        - mysql
+        - postgresql
 
-    - dist: trusty
-      env:
-        - JOB="3.6, coverage" ENV_FILE="ci/deps/travis-36-cov.yaml" PATTERN="((not slow and not network) or (single and db))" PANDAS_TESTING_MODE="deprecate" COVERAGE=true
+    - env:
+        - JOB="3.6, coverage" ENV_FILE="ci/deps/travis-36-cov.yaml" PATTERN="((not slow and not network) or (single and db))" PANDAS_TESTING_MODE="deprecate" COVERAGE=true SQL="1"
+      services:
+        - mysql
+        - postgresql
 
     # In allow_failures
-    - dist: trusty
-      env:
-        - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow"
+    - env:
+        - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow" SQL="1"
+      services:
+        - mysql
+        - postgresql
 
     allow_failures:
-      - dist: trusty
-        env:
-          - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow"
+      - env:
+          - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow" SQL="1"
 
 before_install:
   - echo "before_install"

diff --git a/LICENSES/MSGPACK_LICENSE b/LICENSES/MSGPACK_LICENSE
diff --git a/LICENSES/MSGPACK_NUMPY_LICENSE b/LICENSES/MSGPACK_NUMPY_LICENSE
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -20,7 +20,6 @@ global-exclude *.gz
 global-exclude *.h5
 global-exclude *.html
 global-exclude *.json
-global-exclude *.msgpack
 global-exclude *.pickle
 global-exclude *.png
 global-exclude *.pyc

diff --git a/README.md b/README.md
@@ -124,7 +124,7 @@ Here are just a few of the things that pandas does well:
     and saving/loading data from the ultrafast [**HDF5 format**][hdfstore]
   - [**Time series**][timeseries]-specific functionality: date range
     generation and frequency conversion, moving window statistics,
-    moving window linear regressions, date shifting and lagging, etc.
+    date shifting and lagging.
 
 
    [missing-data]: https://pandas.pydata.org/pandas-docs/stable/missing_data.html#working-with-missing-data

diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py
@@ -0,0 +1,23 @@
+import numpy as np
+
+import pandas as pd
+
+
+class BooleanArray:
+    def setup(self):
+        self.values_bool = np.array([True, False, True, False])
+        self.values_float = np.array([1.0, 0.0, 1.0, 0.0])
+        self.values_integer = np.array([1, 0, 1, 0])
+        self.values_integer_like = [1, 0, 1, 0]
+
+    def time_from_bool_array(self):
+        pd.array(self.values_bool, dtype="boolean")
+
+    def time_from_integer_array(self):
+        pd.array(self.values_integer, dtype="boolean")
+
+    def time_from_integer_like(self):
+        pd.array(self.values_integer_like, dtype="boolean")
+
+    def time_from_float_array(self):
+        pd.array(self.values_float, dtype="boolean")
diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py
@@ -1,3 +1,5 @@
+import operator
+
 import numpy as np
 
 from pandas import DataFrame, Series, date_range
@@ -9,6 +11,36 @@
     import pandas.computation.expressions as expr
 
 
+class IntFrameWithScalar:
+    params = [
+        [np.float64, np.int64],
+        [2, 3.0, np.int32(4), np.float64(5)],
+        [
+            operator.add,
+            operator.sub,
+            operator.mul,
+            operator.truediv,
+            operator.floordiv,
+            operator.pow,
+            operator.mod,
+            operator.eq,
+            operator.ne,
+            operator.gt,
+            operator.ge,
+            operator.lt,
+            operator.le,
+        ],
+    ]
+    param_names = ["dtype", "scalar", "op"]
+
+    def setup(self, dtype, scalar, op):
+        arr = np.random.randn(20000, 100)
+        self.df = DataFrame(arr.astype(dtype))
+
+    def time_frame_op_with_scalar(self, dtype, scalar, op):
+        op(self.df, scalar)
+
+
 class Ops:
 
     params = [[True, False], ["default", 1]]

diff --git a/asv_bench/benchmarks/boolean.py b/asv_bench/benchmarks/boolean.py
@@ -0,0 +1,32 @@
+import numpy as np
+
+import pandas as pd
+
+
+class TimeLogicalOps:
+    def setup(self):
+        N = 10_000
+        left, right, lmask, rmask = np.random.randint(0, 2, size=(4, N)).astype("bool")
+        self.left = pd.arrays.BooleanArray(left, lmask)
+        self.right = pd.arrays.BooleanArray(right, rmask)
+
+    def time_or_scalar(self):
+        self.left | True
+        self.left | False
+
+    def time_or_array(self):
+        self.left | self.right
+
+    def time_and_scalar(self):
+        self.left & True
+        self.left & False
+
+    def time_and_array(self):
+        self.left & self.right
+
+    def time_xor_scalar(self):
+        self.left ^ True
+        self.left ^ False
+
+    def time_xor_array(self):
+        self.left ^ self.right
diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py
@@ -5,6 +5,7 @@
 from .pandas_vb_common import (
     datetime_dtypes,
     extension_dtypes,
+    lib,
     numeric_dtypes,
     string_dtypes,
 )
@@ -40,4 +41,25 @@ def time_pandas_dtype_invalid(self, dtype):
             pass
 
 
+class InferDtypes:
+    param_names = ["dtype"]
+    data_dict = {
+        "np-object": np.array([1] * 100000, dtype="O"),
+        "py-object": [1] * 100000,
+        "np-null": np.array([1] * 50000 + [np.nan] * 50000),
+        "py-null": [1] * 50000 + [None] * 50000,
+        "np-int": np.array([1] * 100000, dtype=int),
+        "np-floating": np.array([1.0] * 100000, dtype=float),
+        "empty": [],
+        "bytes": [b"a"] * 100000,
+    }
+    params = list(data_dict.keys())
+
+    def time_infer_skipna(self, dtype):
+        lib.infer_dtype(self.data_dict[dtype], skipna=True)
+
+    def time_infer(self, dtype):
+        lib.infer_dtype(self.data_dict[dtype], skipna=False)
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py
@@ -105,4 +105,16 @@ def time_frame_from_lists(self):
         self.df = DataFrame(self.data)
 
 
+class FromRange:
+
+    goal_time = 0.2
+
+    def setup(self):
+        N = 1_000_000
+        self.data = range(N)
+
+    def time_frame_from_range(self):
+        self.df = DataFrame(self.data)
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
@@ -321,10 +321,9 @@ class Dropna:
 
     def setup(self, how, axis):
         self.df = DataFrame(np.random.randn(10000, 1000))
-        with warnings.catch_warnings(record=True):
-            self.df.ix[50:1000, 20:50] = np.nan
-            self.df.ix[2000:3000] = np.nan
-            self.df.ix[:, 60:70] = np.nan
+        self.df.iloc[50:1000, 20:50] = np.nan
+        self.df.iloc[2000:3000] = np.nan
+        self.df.iloc[:, 60:70] = np.nan
         self.df_mixed = self.df.copy()
         self.df_mixed["foo"] = "bar"
 
@@ -342,10 +341,9 @@ class Count:
 
     def setup(self, axis):
         self.df = DataFrame(np.random.randn(10000, 1000))
-        with warnings.catch_warnings(record=True):
-            self.df.ix[50:1000, 20:50] = np.nan
-            self.df.ix[2000:3000] = np.nan
-            self.df.ix[:, 60:70] = np.nan
+        self.df.iloc[50:1000, 20:50] = np.nan
+        self.df.iloc[2000:3000] = np.nan
+        self.df.iloc[:, 60:70] = np.nan
         self.df_mixed = self.df.copy()
         self.df_mixed["foo"] = "bar"
 

diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py
@@ -7,6 +7,7 @@
     Float64Index,
     Index,
     IntervalIndex,
+    MultiIndex,
     RangeIndex,
     Series,
     date_range,
@@ -111,6 +112,18 @@ def time_get_loc_dec(self):
         self.idx_dec.get_loc(100000)
 
 
+class IndexEquals:
+    def setup(self):
+        idx_large_fast = RangeIndex(100000)
+        idx_small_slow = date_range(start="1/1/2012", periods=1)
+        self.mi_large_slow = MultiIndex.from_product([idx_large_fast, idx_small_slow])
+
+        self.idx_non_object = RangeIndex(1)
+
+    def time_non_object_equals_multiindex(self):
+        self.idx_non_object.equals(self.mi_large_slow)
+
+
 class IndexAppend:
     def setup(self):