Merge remote-tracking branch 'upstream/master' into bug/categorical-i…

…ndexing-1row-df * upstream/master: (49 commits) repr() (pandas-dev#29959) DOC : Typo fix in userguide/Styling (pandas-dev#29956) CLN: small things in pytables (pandas-dev#29958) API/DEPR: Change default skipna behaviour + deprecate numeric_only in Categorical.min and max (pandas-dev#27929) DEPR: DTI/TDI/PI constructor arguments (pandas-dev#29930) CLN: fix pytables passing too many kwargs (pandas-dev#29951) Typing (pandas-dev#29947) repr() (pandas-dev#29948) repr() (pandas-dev#29950) Added space at the end of the sentence (pandas-dev#29949) ENH: add NA scalar for missing value indicator, use in StringArray. (pandas-dev#29597) CLN: BlockManager.apply (pandas-dev#29825) TST: add test for rolling max/min/mean with DatetimeIndex over different frequencies (pandas-dev#29932) CLN: explicit signature for to_hdf (pandas-dev#29939) CLN: make kwargs explicit for pytables read_ methods (pandas-dev#29935) Convert core/indexes/base.py to f-strings (pandas-dev#29903) DEPR: dropna multiple axes, fillna int for td64, from_codes with floats, Series.nonzero (pandas-dev#29875) CLN: make kwargs explicit in pytables constructors (pandas-dev#29936) DEPR: tz_convert in the Timestamp constructor raises (pandas-dev#29929) STY: F-strings and repr (pandas-dev#29938) ...
keechongtan · Dec 2, 2019 · 593dda1 · 593dda1
2 parents 4257fe8 + 0c2b1db
commit 593dda1
Show file tree

Hide file tree

Showing 159 changed files with 1,831 additions and 2,645 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -80,15 +80,18 @@ jobs:
         git fetch upstream
         if git diff upstream/master --name-only | grep -q "^asv_bench/"; then
             asv machine --yes
-            ASV_OUTPUT="$(asv dev)"
-            if [[ $(echo "$ASV_OUTPUT" | grep "failed") ]]; then
-                echo "##vso[task.logissue type=error]Benchmarks run with errors"
-                echo "$ASV_OUTPUT"
+            asv dev | sed "/failed$/ s/^/##[error]/" | tee benchmarks.log
+            if grep "failed" benchmarks.log > /dev/null ; then
                 exit 1
-            else
-                echo "Benchmarks run without errors"
             fi
         else
             echo "Benchmarks did not run, no changes detected"
         fi
       if: true
+
+    - name: Publish benchmarks artifact
+      uses: actions/upload-artifact@master
+      with:
+        name: Benchmarks log
+        path: asv_bench/benchmarks.log
+      if: failure()
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
@@ -565,7 +565,7 @@ def setup(self):
 
     def time_frame_get_dtype_counts(self):
         with warnings.catch_warnings(record=True):
-            self.df.get_dtype_counts()
+            self.df._data.get_dtype_counts()
 
     def time_info(self):
         self.df.info()

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -34,17 +34,13 @@ function invgrep {
     #
     # This is useful for the CI, as we want to fail if one of the patterns
     # that we want to avoid is found by grep.
-    if [[ "$AZURE" == "true" ]]; then
-        set -o pipefail
-        grep -n "$@" | awk -F ":" '{print "##vso[task.logissue type=error;sourcepath=" $1 ";linenumber=" $2 ";] Found unwanted pattern: " $3}'
-    else
-        grep "$@"
-    fi
-    return $((! $?))
+    grep -n "$@" | sed "s/^/$INVGREP_PREPEND/" | sed "s/$/$INVGREP_APPEND/" ; EXIT_STATUS=${PIPESTATUS[0]}
+    return $((! $EXIT_STATUS))
 }
 
-if [[ "$AZURE" == "true" ]]; then
-    FLAKE8_FORMAT="##vso[task.logissue type=error;sourcepath=%(path)s;linenumber=%(row)s;columnnumber=%(col)s;code=%(code)s;]%(text)s"
+if [[ "$GITHUB_ACTIONS" == "true" ]]; then
+    FLAKE8_FORMAT="##[error]%(path)s:%(row)s:%(col)s:%(code):%(text)s"
+    INVGREP_PREPEND="##[error]"
 else
     FLAKE8_FORMAT="default"
 fi
@@ -198,15 +194,15 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then
     invgrep -R --include="*.py" -P '# type: (?!ignore)' pandas
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
+    MSG='Check for use of foo.__class__ instead of type(foo)' ; echo $MSG
+    invgrep -R --include=*.{py,pyx} '\.__class__' pandas
+    RET=$(($RET + $?)) ; echo $MSG "DONE"
+
     MSG='Check that no file in the repo contains trailing whitespaces' ; echo $MSG
-    set -o pipefail
-    if [[ "$AZURE" == "true" ]]; then
-        # we exclude all c/cpp files as the c/cpp files of pandas code base are tested when Linting .c and .h files
-        ! grep -n '--exclude=*.'{svg,c,cpp,html,js} --exclude-dir=env -RI "\s$" * | awk -F ":" '{print "##vso[task.logissue type=error;sourcepath=" $1 ";linenumber=" $2 ";] Tailing whitespaces found: " $3}'
-    else
-        ! grep -n '--exclude=*.'{svg,c,cpp,html,js} --exclude-dir=env -RI "\s$" * | awk -F ":" '{print $1 ":" $2 ":Tailing whitespaces found: " $3}'
-    fi
+    INVGREP_APPEND=" <- trailing whitespaces found"
+    invgrep -RI --exclude=\*.{svg,c,cpp,html,js} --exclude-dir=env "\s$" *
     RET=$(($RET + $?)) ; echo $MSG "DONE"
+    unset INVGREP_APPEND
 fi
 
 ### CODE ###

diff --git a/doc/redirects.csv b/doc/redirects.csv
diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst
@@ -2006,7 +2006,7 @@ The number of columns of each type in a ``DataFrame`` can be found by calling
 
 Numeric dtypes will propagate and can coexist in DataFrames.
 If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``,
-or a passed ``Series``, then it will be preserved in DataFrame operations. Furthermore,
+or a passed ``Series``), then it will be preserved in DataFrame operations. Furthermore,
 different numeric dtypes will **NOT** be combined. The following example will give you a taste.
 
 .. ipython:: python

diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst
@@ -28,7 +28,6 @@ Attributes and underlying data
    :toctree: api/
 
    DataFrame.dtypes
-   DataFrame.get_dtype_counts
    DataFrame.select_dtypes
    DataFrame.values
    DataFrame.get_values
@@ -363,7 +362,6 @@ Serialization / IO / conversion
    DataFrame.to_msgpack
    DataFrame.to_gbq
    DataFrame.to_records
-   DataFrame.to_dense
    DataFrame.to_string
    DataFrame.to_clipboard
    DataFrame.style
diff --git a/doc/source/reference/indexing.rst b/doc/source/reference/indexing.rst
@@ -32,7 +32,6 @@ Properties
    Index.has_duplicates
    Index.hasnans
    Index.dtype
-   Index.dtype_str
    Index.inferred_type
    Index.is_all_dates
    Index.shape
@@ -42,9 +41,6 @@ Properties
    Index.ndim
    Index.size
    Index.empty
-   Index.strides
-   Index.itemsize
-   Index.base
    Index.T
    Index.memory_usage
 

diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst
@@ -33,16 +33,11 @@ Attributes
    Series.nbytes
    Series.ndim
    Series.size
-   Series.strides
-   Series.itemsize
-   Series.base
    Series.T
    Series.memory_usage
    Series.hasnans
-   Series.flags
    Series.empty
    Series.dtypes
-   Series.data
    Series.name
    Series.put
 
@@ -584,7 +579,6 @@ Serialization / IO / conversion
    Series.to_sql
    Series.to_msgpack
    Series.to_json
-   Series.to_dense
    Series.to_string
    Series.to_clipboard
    Series.to_latex
diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst
@@ -374,7 +374,7 @@ For getting values with a boolean array:
    df1.loc['a'] > 0
    df1.loc[:, df1.loc['a'] > 0]
 
-For getting a value explicitly (equivalent to deprecated ``df.get_value('a','A')``):
+For getting a value explicitly:
 
 .. ipython:: python
 

diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst
@@ -12,10 +12,10 @@ pandas.
 .. note::
 
     The choice of using ``NaN`` internally to denote missing data was largely
-    for simplicity and performance reasons. It differs from the MaskedArray
-    approach of, for example, :mod:`scikits.timeseries`. We are hopeful that
-    NumPy will soon be able to provide a native NA type solution (similar to R)
-    performant enough to be used in pandas.
+    for simplicity and performance reasons.
+    Starting from pandas 1.0, some optional data types start experimenting
+    with a native ``NA`` scalar using a mask-based approach. See
+    :ref:`here <missing_data.NA>` for more.
 
 See the :ref:`cookbook<cookbook.missing_data>` for some advanced strategies.
 
@@ -110,7 +110,7 @@ pandas objects provide compatibility between ``NaT`` and ``NaN``.
 .. _missing.inserting:
 
 Inserting missing data
-----------------------
+~~~~~~~~~~~~~~~~~~~~~~
 
 You can insert missing values by simply assigning to containers. The
 actual missing value used will be chosen based on the dtype.
@@ -135,9 +135,10 @@ For object containers, pandas will use the value given:
    s.loc[1] = np.nan
    s
 
+.. _missing_data.calculations:
 
 Calculations with missing data
-------------------------------
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Missing values propagate naturally through arithmetic operations between pandas
 objects.
@@ -771,3 +772,139 @@ the ``dtype="Int64"``.
    s
 
 See :ref:`integer_na` for more.
+
+
+.. _missing_data.NA:
+
+Experimental ``NA`` scalar to denote missing values
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. warning::
+
+   Experimental: the behaviour of ``pd.NA`` can still change without warning.
+
+.. versionadded:: 1.0.0
+
+Starting from pandas 1.0, an experimental ``pd.NA`` value (singleton) is
+available to represent scalar missing values. At this moment, it is used in
+the nullable :doc:`integer <integer_na>`, boolean and
+:ref:`dedicated string <text.types>` data types as the missing value indicator.
+
+The goal of ``pd.NA`` is provide a "missing" indicator that can be used
+consistently accross data types (instead of ``np.nan``, ``None`` or ``pd.NaT``
+depending on the data type).
+
+For example, when having missing values in a Series with the nullable integer
+dtype, it will use ``pd.NA``:
+
+.. ipython:: python
+
+    s = pd.Series([1, 2, None], dtype="Int64")
+    s
+    s[2]
+    s[2] is pd.NA
+
+Currently, pandas does not yet use those data types by default (when creating
+a DataFrame or Series, or when reading in data), so you need to specify
+the dtype explicitly.
+
+Propagation in arithmetic and comparison operations
+---------------------------------------------------
+
+In general, missing values *propagate* in operations involving ``pd.NA``. When
+one of the operands is unknown, the outcome of the operation is also unknown.
+
+For example, ``pd.NA`` propagates in arithmetic operations, similarly to
+``np.nan``:
+
+.. ipython:: python
+
+   pd.NA + 1
+   "a" * pd.NA
+
+In equality and comparison operations, ``pd.NA`` also propagates. This deviates
+from the behaviour of ``np.nan``, where comparisons with ``np.nan`` always
+return ``False``.
+
+.. ipython:: python
+
+   pd.NA == 1
+   pd.NA == pd.NA
+   pd.NA < 2.5
+
+To check if a value is equal to ``pd.NA``, the :func:`isna` function can be
+used:
+
+.. ipython:: python
+
+   pd.isna(pd.NA)
+
+An exception on this basic propagation rule are *reductions* (such as the
+mean or the minimum), where pandas defaults to skipping missing values. See
+:ref:`above <missing_data.calculations>` for more.
+
+Logical operations
+------------------
+
+For logical operations, ``pd.NA`` follows the rules of the
+`three-valued logic <https://en.wikipedia.org/wiki/Three-valued_logic>`__ (or
+*Kleene logic*, similarly to R, SQL and Julia). This logic means to only
+propagate missing values when it is logically required.
+
+For example, for the logical "or" operation (``|``), if one of the operands
+is ``True``, we already know the result will be ``True``, regardless of the
+other value (so regardless the missing value would be ``True`` or ``False``).
+In this case, ``pd.NA`` does not propagate:
+
+.. ipython:: python
+
+   True | False
+   True | pd.NA
+   pd.NA | True
+
+On the other hand, if one of the operands is ``False``, the result depends
+on the value of the other operand. Therefore, in this case ``pd.NA``
+propagates:
+
+.. ipython:: python
+
+   False | True
+   False | False
+   False | pd.NA
+
+The behaviour of the logical "and" operation (``&``) can be derived using
+similar logic (where now ``pd.NA`` will not propagate if one of the operands
+is already ``False``):
+
+.. ipython:: python
+
+   False & True
+   False & False
+   False & pd.NA
+
+.. ipython:: python
+
+   True & True
+   True & False
+   True & pd.NA
+
+
+``NA`` in a boolean context
+---------------------------
+
+Since the actual value of an NA is unknown, it is ambiguous to convert NA
+to a boolean value. The following raises an error:
+
+.. ipython:: python
+   :okexcept:
+
+   bool(pd.NA)
+
+This also means that ``pd.NA`` cannot be used in a context where it is
+evaluated to a boolean, such as ``if condition: ...`` where ``condition`` can
+potentially be ``pd.NA``. In such cases, :func:`isna` can be used to check
+for ``pd.NA`` or ``condition`` being ``pd.NA`` can be avoided, for example by
+filling missing values beforehand.
+
+A similar situation occurs when using Series or DataFrame objects in ``if``
+statements, see :ref:`gotchas.truth`.
diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb
@@ -677,7 +677,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Notice that you're able share the styles even though they're data aware. The styles are re-evaluated on the new DataFrame they've been `use`d upon."
+    "Notice that you're able to share the styles even though they're data aware. The styles are re-evaluated on the new DataFrame they've been `use`d upon."
    ]
   },
   {

diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst
@@ -312,14 +312,13 @@ Timezone handling improvements
   previously this resulted in ``Exception`` or ``TypeError`` (:issue:`7812`)
 
   .. ipython:: python
-     :okwarning:
 
      ts = pd.Timestamp('2014-08-01 09:00', tz='US/Eastern')
      ts
      ts.tz_localize(None)
 
-     didx = pd.DatetimeIndex(start='2014-08-01 09:00', freq='H',
-                             periods=10, tz='US/Eastern')
+     didx = pd.date_range(start='2014-08-01 09:00', freq='H',
+                          periods=10, tz='US/Eastern')
      didx
      didx.tz_localize(None)
 

diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst
@@ -9,7 +9,7 @@ including other versions of pandas.
 I/O and LZMA
 ~~~~~~~~~~~~
 
-Some users may unknowingly have an incomplete Python installation lacking the `lzma` module from the standard library. In this case, `import pandas` failed due to an `ImportError` (:issue: `27575`).
+Some users may unknowingly have an incomplete Python installation lacking the `lzma` module from the standard library. In this case, `import pandas` failed due to an `ImportError` (:issue:`27575`).
 Pandas will now warn, rather than raising an `ImportError` if the `lzma` module is not present. Any subsequent attempt to use `lzma` methods will raise a `RuntimeError`.
 A possible fix for the lack of the `lzma` module is to ensure you have the necessary libraries and then re-install Python.
 For example, on MacOS installing Python with `pyenv` may lead to an incomplete Python installation due to unmet system dependencies at compilation time (like `xz`). Compilation will succeed, but Python might fail at run time. The issue can be solved by installing the necessary dependencies and then re-installing Python.