Commit

Merge remote-tracking branch 'upstream/master' into ea-take
TomAugspurger committed Apr 25, 2018
2 parents 67ba9dd + 60fe82c commit 37915e9
Showing 10 changed files with 211 additions and 32 deletions.
21 changes: 21 additions & 0 deletions asv_bench/benchmarks/categoricals.py
@@ -148,3 +148,24 @@ def time_rank_int_cat(self):

    def time_rank_int_cat_ordered(self):
        self.s_int_cat_ordered.rank()


class Isin(object):

    goal_time = 0.2

    params = ['object', 'int64']
    param_names = ['dtype']

    def setup(self, dtype):
        np.random.seed(1234)
        n = 5 * 10**5
        sample_size = 100
        arr = [i for i in np.random.randint(0, n // 10, size=n)]
        if dtype == 'object':
            arr = ['s%04d' % i for i in arr]
        self.sample = np.random.choice(arr, sample_size)
        self.series = pd.Series(arr).astype('category')

    def time_isin_categorical(self, dtype):
        self.series.isin(self.sample)
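For a quick local sanity check outside asv, a rough equivalent of this benchmark can be run with timeit. This is a sketch, not part of the commit; the sizes mirror the ``setup`` above:

import timeit

import numpy as np
import pandas as pd

np.random.seed(1234)
n = 5 * 10**5
arr = np.random.randint(0, n // 10, size=n)
sample = np.random.choice(arr, 100)

plain = pd.Series(arr)            # the raw integer data
cat = plain.astype('category')    # the same data as a categorical

# compare the two isin() paths; the categorical one is what this change speeds up
print('plain      :', timeit.timeit(lambda: plain.isin(sample), number=10))
print('categorical:', timeit.timeit(lambda: cat.isin(sample), number=10))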
2 changes: 1 addition & 1 deletion doc/source/indexing.rst
@@ -1773,7 +1773,7 @@ These both yield the same results, so which should you use? It is instructive to
of operations on these and why method 2 (``.loc``) is much preferred over method 1 (chained ``[]``).

``dfmi['one']`` selects the first level of the columns and returns a DataFrame that is singly-indexed.
Then another Python operation ``dfmi_with_one['second']`` selects the series indexed by ``'second'`` happens.
Then another Python operation ``dfmi_with_one['second']`` selects the series indexed by ``'second'``.
This is indicated by the variable ``dfmi_with_one`` because pandas sees these operations as separate events.
e.g. separate calls to ``__getitem__``, so it has to treat them as linear operations, they happen one after another.
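As a concrete illustration of the two access paths described above (a sketch; ``dfmi`` here stands in for the MultiIndex-column frame defined earlier in that docs section):

import numpy as np
import pandas as pd

cols = pd.MultiIndex.from_product([['one', 'two'], ['first', 'second']])
dfmi = pd.DataFrame(np.arange(16).reshape(4, 4), columns=cols)

# method 1: chained [] -- two separate __getitem__ calls
chained = dfmi['one']['second']

# method 2: a single .loc call keyed by the full column tuple
single = dfmi.loc[:, ('one', 'second')]

# both select the same values; .loc does it in one operation
assert (chained.values == single.values).all()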

5 changes: 5 additions & 0 deletions doc/source/whatsnew/v0.23.0.txt
@@ -954,6 +954,7 @@ Performance Improvements
- Improved performance of :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` (:issue:`11296`)
- Improved performance of :func:`pandas.core.groupby.GroupBy.any` and :func:`pandas.core.groupby.GroupBy.all` (:issue:`15435`)
- Improved performance of :func:`pandas.core.groupby.GroupBy.pct_change` (:issue:`19165`)
- Improved performance of :func:`Series.isin` in the case of categorical dtypes (:issue:`20003`)
- Fixed a performance regression for :func:`GroupBy.nth` and :func:`GroupBy.last` with some object columns (:issue:`19283`)

.. _whatsnew_0230.docs:
@@ -1156,6 +1157,10 @@ I/O
- Bug in :func:`read_csv` causing heap corruption on 32-bit, big-endian architectures (:issue:`20785`)
- Bug in :func:`read_sas` where a file with 0 variables gave an ``AttributeError`` incorrectly. Now it gives an ``EmptyDataError`` (:issue:`18184`)
- Bug in :func:`DataFrame.to_latex()` where pairs of braces meant to serve as invisible placeholders were escaped (:issue:`18667`)
- Bug in :func:`DataFrame.to_latex()` where a ``NaN`` in a ``MultiIndex`` would cause an ``IndexError`` or incorrect output (:issue:`14249`)
- Bug in :func:`DataFrame.to_latex()` where a non-string index-level name would result in an ``AttributeError`` (:issue:`19981`)
- Bug in :func:`DataFrame.to_latex()` where the combination of an index name and the ``index_names=False`` option would result in incorrect output (:issue:`18326`)
- Bug in :func:`DataFrame.to_latex()` where a ``MultiIndex`` with an empty string as its name would result in incorrect output (:issue:`18669`)
- Bug in :func:`read_json` where large numeric values were causing an ``OverflowError`` (:issue:`18842`)
- Bug in :func:`DataFrame.to_parquet` where an exception was raised if the write destination is S3 (:issue:`19134`)
- :class:`Interval` now supported in :func:`DataFrame.to_excel` for all Excel file types (:issue:`19242`)
7 changes: 7 additions & 0 deletions pandas/core/algorithms.py
@@ -408,6 +408,13 @@ def isin(comps, values):
    if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)):
        values = construct_1d_object_array_from_listlike(list(values))

    if is_categorical_dtype(comps):
        # TODO(extension)
        # handle categoricals
        return comps._values.isin(values)

    comps = com._values_from_object(comps)

    comps, dtype, _ = _ensure_data(comps)
    values, _, _ = _ensure_data(values, dtype=dtype)

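With this branch in place, a Series backed by a categorical keeps its dtype on the way into ``isin`` and is answered by ``Categorical.isin`` instead of being densified to an object array first. A small sketch of the observable behaviour (assuming a pandas build with this change applied):

import pandas as pd

s = pd.Series(['a', 'b', 'c', 'a']).astype('category')

# Series.isin -> algorithms.isin -> Categorical.isin on the underlying values
print(s.isin(['a', 'c']).values)    # [ True False  True  True]

# the same method is now available directly on the Categorical
print(s.values.isin(['a', 'c']))    # [ True False  True  True]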
56 changes: 56 additions & 0 deletions pandas/core/arrays/categorical.py
@@ -39,6 +39,8 @@
from pandas.util._decorators import (
    Appender, cache_readonly, deprecate_kwarg, Substitution)

import pandas.core.algorithms as algorithms

from pandas.io.formats.terminal import get_terminal_size
from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs
from pandas.core.config import get_option
@@ -2216,6 +2218,60 @@ def _concat_same_type(self, to_concat):
    def _formatting_values(self):
        return self

    def isin(self, values):
        """
        Check whether `values` are contained in Categorical.

        Return a boolean NumPy Array showing whether each element in
        the Categorical matches an element in the passed sequence of
        `values` exactly.

        Parameters
        ----------
        values : set or list-like
            The sequence of values to test. Passing in a single string will
            raise a ``TypeError``. Instead, turn a single string into a
            list of one element.

        Returns
        -------
        isin : numpy.ndarray (bool dtype)

        Raises
        ------
        TypeError
            * If `values` is not a set or list-like

        See Also
        --------
        pandas.Series.isin : equivalent method on Series

        Examples
        --------
        >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama',
        ...                     'hippo'])
        >>> s.isin(['cow', 'lama'])
        array([ True,  True,  True, False,  True, False])

        Passing a single string as ``s.isin('lama')`` will raise an error. Use
        a list of one element instead:

        >>> s.isin(['lama'])
        array([ True, False,  True, False,  True, False])
        """
        from pandas.core.series import _sanitize_array
        if not is_list_like(values):
            raise TypeError("only list-like objects are allowed to be passed"
                            " to isin(), you passed a [{values_type}]"
                            .format(values_type=type(values).__name__))
        values = _sanitize_array(values, None, None)
        null_mask = np.asarray(isna(values))
        code_values = self.categories.get_indexer(values)
        code_values = code_values[null_mask | (code_values >= 0)]
        return algorithms.isin(self.codes, code_values)


# The Series.cat accessor


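The implementation above avoids materialising the categorical as an object array: it maps the search values onto category codes with ``categories.get_indexer``, keeps only codes that are present in the categories or stand for missing values (code ``-1``), and then runs a plain integer ``isin`` over ``self.codes``. A hand-rolled sketch of the same idea (illustrative names; ``np.isin`` is used here in place of pandas' internal ``algorithms.isin``):

import numpy as np
import pandas as pd

cat = pd.Categorical(['a', 'b', np.nan, 'a'])
values = ['a', np.nan, 'z']

code_values = cat.categories.get_indexer(values)        # [0, -1, -1]
null_mask = pd.isna(np.asarray(values, dtype=object))   # [False, True, False]

# keep codes for values that exist among the categories, plus -1 if any
# of the search values is missing (NaN is encoded as -1 in cat.codes)
code_values = code_values[null_mask | (code_values >= 0)]

print(np.isin(cat.codes, code_values))                  # [ True False  True  True]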
2 changes: 1 addition & 1 deletion pandas/core/indexes/base.py
@@ -3516,7 +3516,7 @@ def isin(self, values, level=None):
"""
if level is not None:
self._validate_index_level(level)
return algos.isin(np.array(self), values)
return algos.isin(self, values)

    def _can_reindex(self, indexer):
        """
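Passing ``self`` instead of ``np.array(self)`` means the index reaches ``algos.isin`` with its dtype intact, so (presumably the intent here) a categorical-backed index can also take the new fast path rather than first being converted to an object ndarray. The result itself is unchanged, e.g.:

import pandas as pd

idx = pd.CategoricalIndex(['a', 'b', 'a', 'c'])
print(idx.isin(['a', 'c']))   # [ True False  True  True]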
2 changes: 1 addition & 1 deletion pandas/core/series.py
@@ -3570,7 +3570,7 @@ def isin(self, values):
        5 False
        Name: animal, dtype: bool
        """
        result = algorithms.isin(com._values_from_object(self), values)
        result = algorithms.isin(self, values)
        return self._constructor(result, index=self.index).__finalize__(self)

    def between(self, left, right, inclusive=True):
55 changes: 26 additions & 29 deletions pandas/io/formats/latex.py
@@ -64,35 +64,32 @@ def get_col_type(dtype):

        # reestablish the MultiIndex that has been joined by _to_str_column
        if self.fmt.index and isinstance(self.frame.index, MultiIndex):
            out = self.frame.index.format(
                adjoin=False, sparsify=self.fmt.sparsify,
                names=self.fmt.has_index_names, na_rep=self.fmt.na_rep
            )

            # index.format will sparsify repeated entries with empty strings
            # so pad these with some empty space
            def pad_empties(x):
                for pad in reversed(x):
                    if pad:
                        break
                return [x[0]] + [i if i else ' ' * len(pad) for i in x[1:]]
            out = (pad_empties(i) for i in out)

            # Add empty spaces for each column level
            clevels = self.frame.columns.nlevels
            strcols.pop(0)
            name = any(self.frame.index.names)
            cname = any(self.frame.columns.names)
            lastcol = self.frame.index.nlevels - 1
            previous_lev3 = None
            for i, lev in enumerate(self.frame.index.levels):
                lev2 = lev.format()
                blank = ' ' * len(lev2[0])
                # display column names in last index-column
                if cname and i == lastcol:
                    lev3 = [x if x else '{}' for x in self.frame.columns.names]
                else:
                    lev3 = [blank] * clevels
                if name:
                    lev3.append(lev.name)
                current_idx_val = None
                for level_idx in self.frame.index.labels[i]:
                    if ((previous_lev3 is None or
                            previous_lev3[len(lev3)].isspace()) and
                            lev2[level_idx] == current_idx_val):
                        # same index as above row and left index was the same
                        lev3.append(blank)
                    else:
                        # different value than above or left index different
                        lev3.append(lev2[level_idx])
                        current_idx_val = lev2[level_idx]
                strcols.insert(i, lev3)
                previous_lev3 = lev3
            out = [[' ' * len(i[-1])] * clevels + i for i in out]

            # Add the column names to the last index column
            cnames = self.frame.columns.names
            if any(cnames):
                new_names = [i if i else '{}' for i in cnames]
                out[self.frame.index.nlevels - 1][:clevels] = new_names

            # Get rid of old multiindex column and add new ones
            strcols = out + strcols[1:]

        column_format = self.column_format
        if column_format is None:
@@ -118,7 +115,7 @@ def get_col_type(dtype):
        ilevels = self.frame.index.nlevels
        clevels = self.frame.columns.nlevels
        nlevels = clevels
        if any(self.frame.index.names):
        if self.fmt.has_index_names and self.fmt.show_index_names:
            nlevels += 1
        strrows = list(zip(*strcols))
        self.clinebuf = []
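The core of the rewrite is letting ``index.format(adjoin=False, sparsify=...)`` produce one column of strings per index level (with repeated entries blanked out by sparsification) and then padding those blanks so column widths survive into the LaTeX table. A self-contained sketch of the padding helper applied to one such level column (the values are illustrative):

def pad_empties(x):
    # find the width of the last non-empty entry ...
    for pad in reversed(x):
        if pad:
            break
    # ... and replace every empty string with that many spaces
    return [x[0]] + [i if i else ' ' * len(pad) for i in x[1:]]

# one sparsified index-level column, roughly what format(adjoin=False,
# sparsify=True) returns for a level with repeated values
level_col = ['foo', '', '', 'bar', '']
print(pad_empties(level_col))   # ['foo', '   ', '   ', 'bar', '   ']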
22 changes: 22 additions & 0 deletions pandas/tests/categorical/test_algos.py
@@ -47,3 +47,25 @@ def test_factorized_sort_ordered():

    tm.assert_numpy_array_equal(labels, expected_labels)
    tm.assert_categorical_equal(uniques, expected_uniques)


def test_isin_cats():
    # GH2003
    cat = pd.Categorical(["a", "b", np.nan])

    result = cat.isin(["a", np.nan])
    expected = np.array([True, False, True], dtype=bool)
    tm.assert_numpy_array_equal(expected, result)

    result = cat.isin(["a", "c"])
    expected = np.array([True, False, False], dtype=bool)
    tm.assert_numpy_array_equal(expected, result)


@pytest.mark.parametrize("empty", [[], pd.Series(), np.array([])])
def test_isin_empty(empty):
    s = pd.Categorical(["a", "b"])
    expected = np.array([False, False], dtype=bool)

    result = s.isin(empty)
    tm.assert_numpy_array_equal(expected, result)
71 changes: 71 additions & 0 deletions pandas/tests/io/formats/test_to_latex.py
@@ -621,3 +621,74 @@ def test_to_latex_multiindex_names(self, name0, name1, axes):
\end{tabular}
""" % tuple(list(col_names) + [idx_names_row])
        assert observed == expected

    @pytest.mark.parametrize('one_row', [True, False])
    def test_to_latex_multiindex_nans(self, one_row):
        # GH 14249
        df = pd.DataFrame({'a': [None, 1], 'b': [2, 3], 'c': [4, 5]})
        if one_row:
            df = df.iloc[[0]]
        observed = df.set_index(['a', 'b']).to_latex()
        expected = r"""\begin{tabular}{llr}
\toprule
& & c \\
a & b & \\
\midrule
NaN & 2 & 4 \\
"""
        if not one_row:
            expected += r"""1.0 & 3 & 5 \\
"""
        expected += r"""\bottomrule
\end{tabular}
"""
        assert observed == expected

    def test_to_latex_non_string_index(self):
        # GH 19981
        observed = pd.DataFrame([[1, 2, 3]] * 2).set_index([0, 1]).to_latex()
        expected = r"""\begin{tabular}{llr}
\toprule
& & 2 \\
0 & 1 & \\
\midrule
1 & 2 & 3 \\
& 2 & 3 \\
\bottomrule
\end{tabular}
"""
        assert observed == expected

    def test_to_latex_midrule_location(self):
        # GH 18326
        df = pd.DataFrame({'a': [1, 2]})
        df.index.name = 'foo'
        observed = df.to_latex(index_names=False)
        expected = r"""\begin{tabular}{lr}
\toprule
{} & a \\
\midrule
0 & 1 \\
1 & 2 \\
\bottomrule
\end{tabular}
"""

        assert observed == expected

    def test_to_latex_multiindex_empty_name(self):
        # GH 18669
        mi = pd.MultiIndex.from_product([[1, 2]], names=[''])
        df = pd.DataFrame(-1, index=mi, columns=range(4))
        observed = df.to_latex()
        expected = r"""\begin{tabular}{lrrrr}
\toprule
& 0 & 1 & 2 & 3 \\
{} & & & & \\
\midrule
1 & -1 & -1 & -1 & -1 \\
2 & -1 & -1 & -1 & -1 \\
\bottomrule
\end{tabular}
"""
        assert observed == expected
