fix: remove incorrect fix for pandas regression

ibis-project · Mar 30, 2022 · commit 339f544
1 parent f2459eb

Showing 8 changed files with 124 additions and 11 deletions.
diff --git a/ibis/backends/pandas/aggcontext.py b/ibis/backends/pandas/aggcontext.py
@@ -596,9 +596,7 @@ def agg(
         indexed_by_ordering = frame[columns].copy()
         # placeholder column to compute window_sizes below
         indexed_by_ordering['_placeholder'] = 0
-        indexed_by_ordering = indexed_by_ordering.set_index(
-            order_by
-        ).sort_index(kind="stable")
+        indexed_by_ordering = indexed_by_ordering.set_index(order_by)
 
         # regroup if needed
         if group_by:

diff --git a/ibis/backends/pandas/tests/execution/test_timecontext.py b/ibis/backends/pandas/tests/execution/test_timecontext.py
@@ -1,6 +1,7 @@
 import pandas as pd
 import pandas.testing as tm
 import pytest
+from packaging.version import parse as vparse
 
 import ibis
 import ibis.common.exceptions as com
@@ -242,6 +243,11 @@ def test_context_adjustment_multi_window(time_table, time_df3):
     tm.assert_series_equal(result["v2"], expected_win_2)
 
 
+@pytest.mark.xfail(
+    condition=vparse("1.4") <= vparse(pd.__version__) < vparse("1.4.2"),
+    raises=ValueError,
+    reason="https://github.com/pandas-dev/pandas/pull/44068",
+)
 def test_context_adjustment_window_groupby_id(time_table, time_df3):
     """This test case is meant to test trim_window_result method
     in pandas/execution/window.py to see if it could trim Series

diff --git a/ibis/backends/pandas/tests/execution/test_window.py b/ibis/backends/pandas/tests/execution/test_window.py
@@ -1,8 +1,11 @@
+import io
+from datetime import date
 from operator import methodcaller
 
 import numpy as np
 import pandas as pd
 import pytest
+from packaging.version import parse as vparse
 from pandas import testing as tm
 
 import ibis
@@ -546,6 +549,11 @@ def test_window_with_preceding_expr(index):
     tm.assert_series_equal(result, expected)
 
 
+@pytest.mark.xfail(
+    condition=vparse("1.4") <= vparse(pd.__version__) < vparse("1.4.2"),
+    raises=ValueError,
+    reason="https://github.com/pandas-dev/pandas/pull/44068",
+)
 def test_window_with_mlb():
     index = pd.date_range('20170501', '20170507')
     data = np.random.randn(len(index), 3)
@@ -769,3 +777,66 @@ def count_complex(v):
     tm.assert_series_equal(result_nan, expected, check_names=False)
     tm.assert_series_equal(result_non_numeric, expected, check_names=False)
     tm.assert_series_equal(result_nan_non_numeric, expected, check_names=False)
+
+
+@pytest.fixture
+def events():
+    df = pd.DataFrame(
+        {
+            "event_id": [1] * 4 + [2] * 6 + [3] * 2,
+            "measured_on": map(
+                pd.Timestamp,
+                map(
+                    date,
+                    [2021] * 12,
+                    [6] * 4 + [5] * 6 + [7] * 2,
+                    range(1, 13),
+                ),
+            ),
+            "measurement": np.nan,
+        }
+    )
+    df.at[1, "measurement"] = 5.0
+    df.at[4, "measurement"] = 42.0
+    df.at[5, "measurement"] = 42.0
+    df.at[7, "measurement"] = 11.0
+    return df
+
+
+def test_bfill(events):
+    con = ibis.pandas.connect({"t": events})
+    t = con.table("t")
+
+    win = ibis.window(
+        group_by=t.event_id, order_by=ibis.desc(t.measured_on), following=0
+    )
+    grouped = t.mutate(grouper=t.measurement.count().over(win))
+
+    expr = (
+        grouped.group_by([grouped.event_id, grouped.grouper])
+        .mutate(bfill=grouped.measurement.max())
+        .sort_by("measured_on")
+    )
+    result = expr.execute().reset_index(drop=True)
+
+    expected_raw = """\
+event_id measured_on  measurement  grouper  bfill
+       2  2021-05-05         42.0        3   42.0
+       2  2021-05-06         42.0        2   42.0
+       2  2021-05-07          NaN        1   11.0
+       2  2021-05-08         11.0        1   11.0
+       2  2021-05-09          NaN        0    NaN
+       2  2021-05-10          NaN        0    NaN
+       1  2021-06-01          NaN        1    5.0
+       1  2021-06-02          5.0        1    5.0
+       1  2021-06-03          NaN        0    NaN
+       1  2021-06-04          NaN        0    NaN
+       3  2021-07-11          NaN        0    NaN
+       3  2021-07-12          NaN        0    NaN"""
+    expected = pd.read_csv(
+        io.StringIO(expected_raw),
+        sep=r"\s+",
+        header=0,
+        parse_dates=["measured_on"],
+    )
+    tm.assert_frame_equal(result, expected)
diff --git a/ibis/backends/pandas/tests/test_udf.py b/ibis/backends/pandas/tests/test_udf.py
@@ -4,6 +4,7 @@
 import pandas as pd
 import pandas._testing as tm
 import pytest
+from packaging.version import parse as vparse
 
 import ibis
 import ibis.expr.datatypes as dt
@@ -256,6 +257,11 @@ def test_udaf_window(t2, df2):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(
+    condition=vparse("1.4") <= vparse(pd.__version__) < vparse("1.4.2"),
+    raises=ValueError,
+    reason="https://github.com/pandas-dev/pandas/pull/44068",
+)
 def test_udaf_window_interval():
     df = pd.DataFrame(
         collections.OrderedDict(

diff --git a/ibis/backends/tests/test_timecontext.py b/ibis/backends/tests/test_timecontext.py
@@ -1,6 +1,7 @@
 import pandas as pd
 import pandas.testing as tm
 import pytest
+from pytest import param
 
 import ibis
 from ibis.config import option_context
@@ -42,11 +43,23 @@ def filter_by_time_context(df, context):
 @pytest.mark.parametrize(
     'window',
     [
-        ibis.trailing_window(ibis.interval(days=3), order_by=ORDERBY_COL),
-        ibis.trailing_window(
-            ibis.interval(days=3),
-            order_by=ORDERBY_COL,
-            group_by=GROUPBY_COL,
+        param(
+            ibis.trailing_window(ibis.interval(days=3), order_by=ORDERBY_COL),
+            id="order_by",
+        ),
+        param(
+            ibis.trailing_window(
+                ibis.interval(days=3),
+                order_by=ORDERBY_COL,
+                group_by=GROUPBY_COL,
+            ),
+            id="order_by_group_by",
+            marks=[
+                pytest.mark.broken(
+                    ["pandas"],
+                    reason="https://github.com/pandas-dev/pandas/pull/44068",
+                )
+            ],
         ),
     ],
 )

diff --git a/ibis/tests/benchmarks/test_benchmarks.py b/ibis/tests/benchmarks/test_benchmarks.py
@@ -7,6 +7,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+from packaging.version import parse as vparse
 
 import ibis
 import ibis.expr.datatypes as dt
@@ -293,6 +294,13 @@ def high_card_grouped_rolling_udf_wm(t):
     return my_wm(t.value, t.value).over(low_card_rolling_window(t))
 
 
+broken_pandas_grouped_rolling = pytest.mark.xfail(
+    condition=vparse("1.4") <= vparse(pd.__version__) < vparse("1.4.2"),
+    raises=ValueError,
+    reason="https://github.com/pandas-dev/pandas/pull/44068",
+)
+
+
 @pytest.mark.benchmark(group="execution")
 @pytest.mark.parametrize(
     "expression_fn",
@@ -309,17 +317,25 @@ def high_card_grouped_rolling_udf_wm(t):
         pytest.param(simple_sort_projection, id="simple_sort_projection"),
         pytest.param(multikey_sort, id="multikey_sort"),
         pytest.param(multikey_sort_projection, id="multikey_sort_projection"),
-        pytest.param(low_card_grouped_rolling, id="low_card_grouped_rolling"),
         pytest.param(
-            high_card_grouped_rolling, id="high_card_grouped_rolling"
+            low_card_grouped_rolling,
+            id="low_card_grouped_rolling",
+            marks=[broken_pandas_grouped_rolling],
+        ),
+        pytest.param(
+            high_card_grouped_rolling,
+            id="high_card_grouped_rolling",
+            marks=[broken_pandas_grouped_rolling],
         ),
         pytest.param(
             low_card_grouped_rolling_udf_mean,
             id="low_card_grouped_rolling_udf_mean",
+            marks=[broken_pandas_grouped_rolling],
         ),
         pytest.param(
             high_card_grouped_rolling_udf_mean,
             id="high_card_grouped_rolling_udf_mean",
+            marks=[broken_pandas_grouped_rolling],
         ),
         pytest.param(
             low_card_window_analytics_udf, id="low_card_window_analytics_udf"
@@ -330,10 +346,12 @@ def high_card_grouped_rolling_udf_wm(t):
         pytest.param(
             low_card_grouped_rolling_udf_wm,
             id="low_card_grouped_rolling_udf_wm",
+            marks=[broken_pandas_grouped_rolling],
         ),
         pytest.param(
             high_card_grouped_rolling_udf_wm,
             id="high_card_grouped_rolling_udf_wm",
+            marks=[broken_pandas_grouped_rolling],
         ),
     ],
 )

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -73,6 +73,7 @@ mkdocs-material = ">=8.2.1,<9"
 mkdocs-table-reader-plugin = ">=1.0.0,<2"
 mkdocstrings = ">=0.17.0,<0.18.0"
 mypy = "0.942"
+packaging = ">=21.3,<22"
 pyarrow = ">=1,<8"
 pydocstyle = ">=6.1.1,<7"
 pymdown-extensions = ">=9.1,<10"