
Commit

depr(api): deprecate filtering/expression projection in `Table.__getitem__`

jcrist committed Sep 9, 2024
1 parent dfa55b6 commit 0d9acc1
Showing 65 changed files with 454 additions and 640 deletions.
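
The pattern applied throughout the diffs below replaces boolean filtering and expression projection through `Table.__getitem__` with the explicit `filter` and `select` methods. A minimal before/after sketch (the table name and schema here are illustrative, not taken from the commit):

```python
import ibis

# Illustrative table; the name and schema are not taken from the commit.
t = ibis.table({"one": "string", "two": "float64", "three": "int32"}, name="t")

# Deprecated: boolean filtering via __getitem__
old_filtered = t[t.two > 0]
# Preferred: the explicit filter method
new_filtered = t.filter(t.two > 0)

# Deprecated: expression projection via __getitem__
old_projected = t["one", (t.three * 2).name("double")]
# Preferred: the explicit select method (mutate appends columns instead)
new_projected = t.select("one", (t.three * 2).name("double"))

# Unaffected: selecting existing columns by name with a list
cols = t[["one", "two"]]
```
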
2 changes: 1 addition & 1 deletion docs/how-to/extending/builtin.qmd
@@ -79,7 +79,7 @@ rest of the library:
pkgs = ibis.read_parquet(
"https://storage.googleapis.com/ibis-tutorial-data/pypi/2024-04-24/packages.parquet"
)
pandas_ish = pkgs[jw_sim(pkgs.name, "pandas") >= 0.9]
pandas_ish = pkgs.filter(jw_sim(pkgs.name, "pandas") >= 0.9)
pandas_ish
```

36 changes: 13 additions & 23 deletions docs/tutorials/ibis-for-pandas-users.qmd
@@ -126,13 +126,6 @@ Selecting columns is very similar to in pandas. In fact, you can use the same sy
t[["one", "two"]]
```

However, since row-level indexing is not supported in Ibis, the inner list is not necessary.


```{python}
t["one", "two"]
```

## Selecting columns

Selecting columns is done using the same syntax as in pandas `DataFrames`. You can use either
@@ -192,11 +185,11 @@ new_col = unnamed.name("new_col")
new_col
```

You can then add this column to the table using a projection.
You can then add this column to the table using `mutate`.


```{python}
proj = t["one", "two", new_col]
proj = t.mutate(new_col)
proj
```

@@ -301,10 +294,9 @@ penguins.limit(5)
### Filtering rows

In addition to limiting the number of rows that are returned, it is possible to
filter the rows using expressions. Expressions are constructed very similarly to
the way they are in pandas. Ibis expressions are constructed from operations on
columns in a table which return a boolean result. This result is then used to
filter the table.
filter the rows using expressions. This is done using the `filter` method in
ibis. Ibis expressions are constructed from operations on columns in a table
which return a boolean result. This result is then used to filter the table.


```{python}
@@ -324,32 +316,30 @@ get 6 rows back.


```{python}
filtered = penguins[expr]
filtered = penguins.filter(expr)
filtered
```

Of course, the filtering expression can be applied inline as well.


```{python}
filtered = penguins[penguins.bill_length_mm > 37.0]
filtered = penguins.filter(penguins.bill_length_mm > 37.0)
filtered
```

Multiple filtering expressions can be combined into a single expression or chained onto existing
table expressions.
Multiple filtering expressions may be passed to a single call (keeping only rows
where they are all true), or combined using the usual boolean operators `&` and
`|`. The expressions below are equivalent:


```{python}
filtered = penguins[(penguins.bill_length_mm > 37.0) & (penguins.bill_depth_mm > 18.0)]
filtered = penguins.filter(penguins.bill_length_mm > 37.0, penguins.bill_depth_mm > 18.0)
filtered
```

The code above will return the same rows as the code below.


```{python}
filtered = penguins[penguins.bill_length_mm > 37.0][penguins.bill_depth_mm > 18.0]
filtered = penguins.filter((penguins.bill_length_mm > 37.0) & (penguins.bill_depth_mm > 18.0))
filtered
```
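
The previous wording also mentioned chaining filters onto existing table expressions; that idiom carries over directly to the method form, since each `filter` call returns a new table expression. A short sketch (the intermediate name is illustrative):

```python
# Equivalent to the single-call and `&` forms above: each filter call
# returns a new table expression, so predicates can be applied one at a time.
longer_bills = penguins.filter(penguins.bill_length_mm > 37.0)
filtered = longer_bills.filter(longer_bills.bill_depth_mm > 18.0)
filtered
```
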

@@ -359,7 +349,7 @@ is greater than the mean.


```{python}
filtered = penguins[penguins.bill_length_mm > penguins.bill_length_mm.mean()]
filtered = penguins.filter(penguins.bill_length_mm > penguins.bill_length_mm.mean())
filtered
```

64 changes: 29 additions & 35 deletions docs/tutorials/ibis-for-sql-users.qmd
@@ -46,12 +46,6 @@ FROM my_data

In Ibis, this is

```{python}
proj = t["two", "one"]
```

or

```{python}
proj = t.select("two", "one")
```
@@ -78,7 +72,7 @@ new_col = (t.three * 2).name("new_col")
Now, we have:

```{python}
proj = t["two", "one", new_col]
proj = t.select("two", "one", new_col)
ibis.to_sql(proj)
```

@@ -113,15 +107,15 @@ select all columns in a table using the `SELECT *` construct. To do this, use
the table expression itself in a projection:

```{python}
proj = t[t]
proj = t.select(t)
ibis.to_sql(proj)
```

This is how `mutate` is implemented. The example above
`t.mutate(new_col=t.three * 2)` can be written as a normal projection:

```{python}
proj = t[t, new_col]
proj = t.select(t, new_col)
ibis.to_sql(proj)
```

@@ -144,7 +138,7 @@ To write this with Ibis, it is:

```{python}
diff = (t.two - t2.value).name("diff")
joined = t.join(t2, t.one == t2.key)[t, diff]
joined = t.join(t2, t.one == t2.key).select(t, diff)
```

And verify the generated SQL:
@@ -188,19 +182,18 @@ ibis.to_sql(expr)

## Filtering / `WHERE`

You can add filter clauses to a table expression either by indexing with
`[]` (similar to pandas) or use the `filter` method:
You can add filter clauses to a table expression by using the `filter` method:

```{python}
filtered = t[t.two > 0]
filtered = t.filter(t.two > 0)
ibis.to_sql(filtered)
```

`filter` can take a list of expressions, which must all be satisfied for
`filter` can take multiple expressions, which must all be satisfied for
a row to appear in the result:

```{python}
filtered = t.filter([t.two > 0, t.one.isin(["A", "B"])])
filtered = t.filter(t.two > 0, t.one.isin(["A", "B"]))
ibis.to_sql(filtered)
```

@@ -209,7 +202,7 @@ To compose boolean expressions with `AND` or `OR`, use the respective

```{python}
cond = (t.two < 0) | ((t.two > 0) | t.one.isin(["A", "B"]))
filtered = t[cond]
filtered = t.filter(cond)
ibis.to_sql(filtered)
```

@@ -617,7 +610,7 @@ ibis.to_sql(expr)

```{python}
agged = (
    expr[expr.one.notnull()]
    expr.filter(expr.one.notnull())
    .group_by("is_valid")
    .aggregate(three_count=lambda t: t.three.notnull().sum())
)
@@ -632,7 +625,7 @@ keyword. The result of `between` is boolean and can be used with any
other boolean expression:

```{python}
expr = t[t.two.between(10, 50) & t.one.notnull()]
expr = t.filter(t.two.between(10, 50) & t.one.notnull())
ibis.to_sql(expr)
```

@@ -684,15 +677,15 @@ After one or more joins, you can reference any of the joined tables in
a projection immediately after:

```{python}
expr = joined[t1, t2.value2]
expr = joined.select(t1, t2.value2)
ibis.to_sql(expr)
```

If you need to compute an expression that involves both tables, you can
do that also:

```{python}
expr = joined[t1.key1, (t1.value1 - t2.value2).name("diff")]
expr = joined.select(t1.key1, (t1.value1 - t2.value2).name("diff"))
ibis.to_sql(expr)
```

@@ -800,15 +793,15 @@ In these case, we can specify a list of common join keys:

```{python}
joined = t4.join(t5, ["key1", "key2", "key3"])
expr = joined[t4, t5.value2]
expr = joined.select(t4, t5.value2)
ibis.to_sql(expr)
```

You can mix the overlapping key names with other expressions:

```{python}
joined = t4.join(t5, ["key1", "key2", t4.key3.left(4) == t5.key3.left(4)])
expr = joined[t4, t5.value2]
expr = joined.select(t4, t5.value2)
ibis.to_sql(expr)
```

@@ -885,15 +878,15 @@ cond = (events.user_id == purchases.user_id).any()
This can now be used to filter `events`:

```{python}
expr = events[cond]
expr = events.filter(cond)
ibis.to_sql(expr)
```

If you negate the condition, it will instead give you only event data
from users *that have not made a purchase*:

```{python}
expr = events[-cond]
expr = events.filter(-cond)
ibis.to_sql(expr)
```

@@ -916,7 +909,7 @@ you can write with Ibis:

```{python}
cond = events.user_id.isin(purchases.user_id)
expr = events[cond]
expr = events.filter(cond)
ibis.to_sql(expr)
```

@@ -941,7 +934,7 @@ WHERE value1 > (
With Ibis, the code is simpler and more pandas-like:

```{python}
expr = t1[t1.value1 > t2.value2.max()]
expr = t1.filter(t1.value1 > t2.value2.max())
ibis.to_sql(expr)
```

@@ -968,8 +961,8 @@ With Ibis, the code is similar, but you add the correlated filter to the
average statistic:

```{python}
stat = t2[t1.key1 == t2.key3].value2.mean()
expr = t1[t1.value1 > stat]
stat = t2.filter(t1.key1 == t2.key3).value2.mean()
expr = t1.filter(t1.value1 > stat)
ibis.to_sql(expr)
```

@@ -1118,7 +1111,7 @@ Ibis provides a `row_number()` function that allows you to do this:
expr = purchases.mutate(
    row_number=ibis.row_number().over(group_by=[_.user_id], order_by=_.price)
)
expr = expr[_.row_number < 3]
expr = expr.filter(_.row_number < 3)
```

The output of this is a table with the three most expensive items that each user has purchased
@@ -1149,7 +1142,7 @@ Ibis has a set of interval APIs that allow you to do date/time
arithmetic. For example:

```{python}
expr = events[events.ts > (ibis.now() - ibis.interval(years=1))]
expr = events.filter(events.ts > (ibis.now() - ibis.interval(years=1)))
ibis.to_sql(expr)
```

@@ -1214,12 +1207,13 @@ purchases = ibis.table(
metric = purchases.amount.sum().name("total")
agged = purchases.group_by(["region", "kind"]).aggregate(metric)
left = agged[agged.kind == "foo"]
right = agged[agged.kind == "bar"]
left = agged.filter(agged.kind == "foo")
right = agged.filter(agged.kind == "bar")
result = left.join(right, left.region == right.region)[
    left.region, (left.total - right.total).name("diff")
]
result = (
    left.join(right, left.region == right.region)
    .select(left.region, (left.total - right.total).name("diff"))
)
```

Ibis automatically creates a CTE for `agged`:
@@ -184,7 +184,7 @@ transaction count over the past five hours may be useful features. Let’s write
out each of these using Ibis API:

```{python}
user_trans_amt_last_360m_agg = source_table[
user_trans_amt_last_360m_agg = source_table.select(
    source_table.user_id,
    # Calculate the average transaction amount over the past six hours
    source_table.amt.mean()
@@ -207,7 +207,7 @@ user_trans_amt_last_360m_agg = source_table[
    )
    .name("user_trans_count_last_360min"),
    source_table.trans_date_trans_time,
]
)
```

`over()` creates an [over
6 changes: 3 additions & 3 deletions ibis/backends/bigquery/tests/system/test_client.py
@@ -186,7 +186,7 @@ def test_scalar_param_partition_time(parted_alltypes):
assert "PARTITIONTIME" in parted_alltypes.columns
assert "PARTITIONTIME" in parted_alltypes.schema()
param = ibis.param("timestamp('UTC')")
expr = parted_alltypes[param > parted_alltypes.PARTITIONTIME]
expr = parted_alltypes.filter(param > parted_alltypes.PARTITIONTIME)
df = expr.execute(params={param: "2017-01-01"})
assert df.empty

Expand All @@ -201,7 +201,7 @@ def test_parted_column(con, kind):

def test_cross_project_query(public):
table = public.table("posts_questions")
expr = table[table.tags.contains("ibis")][["title", "tags"]]
expr = table.filter(table.tags.contains("ibis"))[["title", "tags"]]
n = 5
df = expr.limit(n).execute()
assert len(df) == n
@@ -231,7 +231,7 @@ def test_multiple_project_queries_execute(con):
trips = con.table("trips", database="nyc-tlc.yellow").limit(5)
predicate = posts_questions.tags == trips.rate_code
cols = [posts_questions.title]
join = posts_questions.left_join(trips, predicate)[cols]
join = posts_questions.left_join(trips, predicate).select(cols)
result = join.execute()
assert list(result.columns) == ["title"]
assert len(result) == 5