diff --git a/docs/_freeze/posts/ibis-to-file/index/execute-results/html.json b/docs/_freeze/posts/ibis-to-file/index/execute-results/html.json index 2361b851b1de..e2beab68dfa7 100644 --- a/docs/_freeze/posts/ibis-to-file/index/execute-results/html.json +++ b/docs/_freeze/posts/ibis-to-file/index/execute-results/html.json @@ -1,14 +1,15 @@ { - "hash": "0fa2180e2b74f2857560218074347a81", + "hash": "9daa53e90728e309985442c8b5f0ac10", "result": { - "markdown": "---\ntitle: \"Ibis sneak peek: writing to files\"\nauthor: Kae Suarez\ndate: 2023-03-09\ncategories:\n - blog\n - io\n - new feature\n - sneak peek\n---\n\nIbis 5.0 is coming soon and will offer new functionality and fixes to users. To enhance clarity around this process, we’re sharing a sneak peek into what we’re working on.\n\nIn Ibis 4.0, we added the ability to read CSVs and Parquet via the Ibis interface. We felt this was important because, well, the ability to read files is simply necessary, be it on a local scale, legacy data, data not yet in a database, and so on. However, for a user, the natural next question was “can I go ahead and write when I’m done?” The answer was no. We didn’t like that, especially since we do care about file-based use cases.\n\nSo, we’ve gone ahead and fixed that for Ibis 5.0.\n\n## Files in, Files out\n\nBefore we can write a file, we need data — so let’s read in a file, to start this off:\n\n::: {#1c631a25 .cell execution_count=1}\n``` {.python .cell-code}\nimport ibis\n\nibis.options.interactive = True\n\nt = ibis.read_csv(\n \"https://storage.googleapis.com/ibis-examples/data/penguins.csv.gz\"\n)\nt\n```\n\n::: {.cell-output .cell-output-display execution_count=1}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓\n┃ species  island     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     year  ┃\n┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩\n│ stringstringfloat64float64int64int64stringint64 │\n├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤\n│ Adelie Torgersen39.118.71813750male  2007 │\n│ Adelie Torgersen39.517.41863800female2007 │\n│ Adelie Torgersen40.318.01953250female2007 │\n│ Adelie TorgersennannanNULLNULLNULL2007 │\n│ Adelie Torgersen36.719.31933450female2007 │\n│ Adelie Torgersen39.320.61903650male  2007 │\n│ Adelie Torgersen38.917.81813625female2007 │\n│ Adelie Torgersen39.219.61954675male  2007 │\n│ Adelie Torgersen34.118.11933475NULL2007 │\n│ Adelie Torgersen42.020.21904250NULL2007 │\n│  │\n└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘\n
\n```\n:::\n:::\n\n\nOf course, we could just write out, but let’s do an operation first — how about using selectors, which you can read more about [here](https://ibis-project.org/blog/selectors/)? Self-promotion aside, here’s an operation:\n\n::: {#4988920a .cell execution_count=2}\n``` {.python .cell-code}\nfrom ibis import _\nimport ibis.selectors as s\n\nexpr = (\n t.group_by(\"species\")\n .mutate(s.across(s.numeric() & ~s.c(\"year\"), (_ - _.mean()) / _.std()))\n)\nexpr\n```\n\n::: {.cell-output .cell-output-display execution_count=2}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓\n┃ species  island     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     year  ┃\n┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩\n│ stringstringfloat64float64float64float64stringint64 │\n├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤\n│ Adelie Biscoe   -1.423513-0.3668740.3129250.053074female2009 │\n│ Adelie Torgersen0.9794260.1262831.8421040.380180male  2009 │\n│ Adelie Torgersen1.542615-0.6134530.9245962.179266male  2008 │\n│ Adelie Biscoe   0.641513-0.366874-0.451665-1.091799female2007 │\n│ Adelie Biscoe   -0.2220431.3591770.0070890.434698male  2009 │\n│ Adelie Torgersen1.3924321.9345271.0775141.743124male  2007 │\n│ Adelie Torgersen1.1296100.8660191.2304321.634089male  2008 │\n│ Adelie Dream    -0.7476860.1262830.465843-0.437586female2009 │\n│ Adelie Dream    -0.860324-0.284681-1.216254-1.200835female2007 │\n│ Adelie Dream    0.7541510.0440900.7716780.434698male  2007 │\n│  │\n└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘\n
\n```\n:::\n:::\n\n\nNow, finally, time to do the exciting part:\n\n::: {#717f6f14 .cell execution_count=3}\n``` {.python .cell-code}\nexpr.to_parquet(\"normalized.parquet\")\n```\n:::\n\n\nLike many things in Ibis, this is as simple and plain-looking as it is important. Being able to create files from Ibis instead of redirecting into other libraries first enables operation at larger scales and fewer steps. Where desired, you can address a backend directly to use its native export functionality — we want to make sure you have the flexibility to use Ibis or the backend as you see fit.\n\n## Wrapping Up\n\nIbis is an interface tool for analytical engines that can reach scales far beyond a laptop. Files are important to Ibis because:\n\n- Ibis also supports local execution, where files are the standard unit of data — we want to support all our users.\n- Files are useful for moving between platforms, and long-term storage that isn’t tied to a particular backend.\n- Files can move more easily between our backends than database files, so we think this adds some convenience for the multi-backend use case.\n\nWe’re excited to release this functionality in Ibis 5.0.\n\nInterested in Ibis? Docs are available on this very website, at:\n\n- [Ibis Docs](https://ibis-project.org/)\n\nand the repo is always at:\n\n- [Ibis GitHub](https://github.com/ibis-project/ibis)\n\nPlease feel free to reach out on GitHub!\n\n", + "engine": "jupyter", + "markdown": "---\ntitle: \"Ibis sneak peek: writing to files\"\nauthor: Kae Suarez\ndate: 2023-03-09\ncategories:\n - blog\n - io\n - new feature\n - sneak peek\n---\n\n\nIbis 5.0 is coming soon and will offer new functionality and fixes to users. To enhance clarity around this process, we’re sharing a sneak peek into what we’re working on.\n\nIn Ibis 4.0, we added the ability to read CSVs and Parquet via the Ibis interface. 
We felt this was important because, well, the ability to read files is simply necessary, be it on a local scale, legacy data, data not yet in a database, and so on. However, for a user, the natural next question was “can I go ahead and write when I’m done?” The answer was no. We didn’t like that, especially since we do care about file-based use cases.\n\nSo, we’ve gone ahead and fixed that for Ibis 5.0.\n\n## Files in, Files out\n\nBefore we can write a file, we need data — so let’s read in a file, to start this off:\n\n::: {#6413c677 .cell execution_count=1}\n``` {.python .cell-code}\nimport ibis\n\nibis.options.interactive = True\n\nt = ibis.read_csv(\n \"https://storage.googleapis.com/ibis-examples/data/penguins.csv.gz\"\n)\nt\n```\n\n::: {.cell-output .cell-output-display execution_count=1}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓\n┃ species  island     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     year  ┃\n┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩\n│ stringstringfloat64float64int64int64stringint64 │\n├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤\n│ Adelie Torgersen39.118.71813750male  2007 │\n│ Adelie Torgersen39.517.41863800female2007 │\n│ Adelie Torgersen40.318.01953250female2007 │\n│ Adelie TorgersenNULLNULLNULLNULLNULL2007 │\n│ Adelie Torgersen36.719.31933450female2007 │\n│ Adelie Torgersen39.320.61903650male  2007 │\n│ Adelie Torgersen38.917.81813625female2007 │\n│ Adelie Torgersen39.219.61954675male  2007 │\n│ Adelie Torgersen34.118.11933475NULL2007 │\n│ Adelie Torgersen42.020.21904250NULL2007 │\n│  │\n└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘\n
\n```\n:::\n:::\n\n\nOf course, we could just write out, but let’s do an operation first — how about using selectors, which you can read more about [here](https://ibis-project.org/blog/selectors/)? Self-promotion aside, here’s an operation:\n\n::: {#d9639c15 .cell execution_count=2}\n``` {.python .cell-code}\nfrom ibis import _\nimport ibis.selectors as s\n\nexpr = (\n t.group_by(\"species\")\n .mutate(s.across(s.numeric() & ~s.cols(\"year\"), (_ - _.mean()) / _.std()))\n)\nexpr\n```\n\n::: {.cell-output .cell-output-display execution_count=2}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓\n┃ species  island  bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     year  ┃\n┡━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩\n│ stringstringfloat64float64float64float64stringint64 │\n├─────────┼────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤\n│ Gentoo Biscoe-0.455854-1.816223-0.954050-1.142626female2007 │\n│ Gentoo Biscoe-0.975022-0.287513-0.491442-0.448342female2009 │\n│ Gentoo Biscoe0.387793-0.898997-1.108253-1.241809female2007 │\n│ Gentoo Biscoe0.8096160.2220560.1253681.237778male  2007 │\n│ Gentoo Biscoe0.030865-0.491341-0.3372400.642677male  2007 │\n│ Gentoo Biscoe-0.326062-1.510481-1.108253-1.043442female2007 │\n│ Gentoo Biscoe-0.682990-0.389427-0.954050-0.547525female2007 │\n│ Gentoo Biscoe-0.2611670.3239700.2795710.245943male  2007 │\n│ Gentoo Biscoe-1.364397-1.612395-1.262455-1.340993female2007 │\n│ Gentoo Biscoe-0.2287190.425884-0.3372400.146759male  2007 │\n│  │\n└─────────┴────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘\n
\n```\n:::\n:::\n\n\nNow, finally, time to do the exciting part:\n\n::: {#bf65d029 .cell execution_count=3}\n``` {.python .cell-code}\nexpr.to_parquet(\"normalized.parquet\")\n```\n:::\n\n\nLike many things in Ibis, this is as simple and plain-looking as it is important. Being able to create files from Ibis instead of redirecting into other libraries first enables operation at larger scales and fewer steps. Where desired, you can address a backend directly to use its native export functionality — we want to make sure you have the flexibility to use Ibis or the backend as you see fit.\n\n## Wrapping Up\n\nIbis is an interface tool for analytical engines that can reach scales far beyond a laptop. Files are important to Ibis because:\n\n- Ibis also supports local execution, where files are the standard unit of data — we want to support all our users.\n- Files are useful for moving between platforms, and long-term storage that isn’t tied to a particular backend.\n- Files can move more easily between our backends than database files, so we think this adds some convenience for the multi-backend use case.\n\nWe’re excited to release this functionality in Ibis 5.0.\n\nInterested in Ibis? 
Docs are available on this very website, at:\n\n- [Ibis Docs](https://ibis-project.org/)\n\nand the repo is always at:\n\n- [Ibis GitHub](https://github.com/ibis-project/ibis)\n\nPlease feel free to reach out on GitHub!\n\n", "supporting": [ "index_files" ], "filters": [], "includes": { "include-in-header": [ - "\n\n\n" + "\n\n\n" ] } } diff --git a/docs/_freeze/posts/selectors/index/execute-results/html.json b/docs/_freeze/posts/selectors/index/execute-results/html.json index b2f162ab91a1..4989326bce26 100644 --- a/docs/_freeze/posts/selectors/index/execute-results/html.json +++ b/docs/_freeze/posts/selectors/index/execute-results/html.json @@ -1,14 +1,15 @@ { - "hash": "f2bd6b42420644f3c1ab498e75534819", + "hash": "0d701ea3df138969aa31249331c958d4", "result": { - "markdown": "---\ntitle: \"Maximizing productivity with selectors\"\nauthor: Phillip Cloud\ndate: 2023-02-27\ncategories:\n - blog\n - new feature\n - productivity\n - duckdb\n---\n\nBefore Ibis 5.0 it's been challenging to concisely express whole-table\noperations with ibis. Happily this is no longer the case in ibis 5.0.\n\nLet's jump right in!\n\nWe'll look at selectors examples using the [`palmerpenguins` data\nset](https://allisonhorst.github.io/palmerpenguins/) with the [DuckDB\nbackend](../../backends/duckdb.qmd).\n\n## Setup\n\n::: {#ba6b57fd .cell execution_count=1}\n``` {.python .cell-code}\nfrom ibis.interactive import *\n\nt = ex.penguins.fetch()\nt\n```\n\n::: {.cell-output .cell-output-display execution_count=1}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓\n┃ species  island     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     year  ┃\n┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩\n│ stringstringfloat64float64int64int64stringint64 │\n├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤\n│ Adelie Torgersen39.118.71813750male  2007 │\n│ Adelie Torgersen39.517.41863800female2007 │\n│ Adelie Torgersen40.318.01953250female2007 │\n│ Adelie TorgersennannanNULLNULLNULL2007 │\n│ Adelie Torgersen36.719.31933450female2007 │\n│ Adelie Torgersen39.320.61903650male  2007 │\n│ Adelie Torgersen38.917.81813625female2007 │\n│ Adelie Torgersen39.219.61954675male  2007 │\n│ Adelie Torgersen34.118.11933475NULL2007 │\n│ Adelie Torgersen42.020.21904250NULL2007 │\n│  │\n└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘\n
\n```\n:::\n:::\n\n\n## Examples\n\n### Normalization\n\nLet's say you want to compute the\n[z-score](https://en.wikipedia.org/wiki/Standard_score) of every numeric column\nand replace the existing data with that normalized value. Here's how you'd do\nthat with selectors:\n\n::: {#c36a2829 .cell execution_count=2}\n``` {.python .cell-code}\nt.mutate(s.across(s.numeric(), (_ - _.mean()) / _.std()))\n```\n\n::: {.cell-output .cell-output-display execution_count=2}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━┓\n┃ species  island     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     year      ┃\n┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━┩\n│ stringstringfloat64float64float64float64stringfloat64   │\n├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────────┤\n│ Adelie Torgersen-0.8832050.784300-1.416272-0.563317male  -1.257484 │\n│ Adelie Torgersen-0.8099390.126003-1.060696-0.500969female-1.257484 │\n│ Adelie Torgersen-0.6634080.429833-0.420660-1.186793female-1.257484 │\n│ Adelie TorgersennannannannanNULL-1.257484 │\n│ Adelie Torgersen-1.3227991.088129-0.562890-0.937403female-1.257484 │\n│ Adelie Torgersen-0.8465721.746426-0.776236-0.688012male  -1.257484 │\n│ Adelie Torgersen-0.9198370.328556-1.416272-0.719186female-1.257484 │\n│ Adelie Torgersen-0.8648881.240044-0.4206600.590115male  -1.257484 │\n│ Adelie Torgersen-1.7990250.480471-0.562890-0.906229NULL-1.257484 │\n│ Adelie Torgersen-0.3520291.543873-0.7762360.060160NULL-1.257484 │\n│  │\n└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────────┘\n
\n```\n:::\n:::\n\n\n### What's Up With the `year` Column?\n\nWhoops, looks like we included `year` in our normalization because it's an\n`int64` column (and therefore numeric) but normalizing the year doesn't make\nsense.\n\nWe can exclude `year` from the normalization using another selector:\n\n::: {#7c987863 .cell execution_count=3}\n``` {.python .cell-code}\nt.mutate(s.across(s.numeric() & ~s.c(\"year\"), (_ - _.mean()) / _.std()))\n```\n\n::: {.cell-output .cell-output-display execution_count=3}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓\n┃ species  island     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     year  ┃\n┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩\n│ stringstringfloat64float64float64float64stringint64 │\n├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤\n│ Adelie Torgersen-0.8832050.784300-1.416272-0.563317male  2007 │\n│ Adelie Torgersen-0.8099390.126003-1.060696-0.500969female2007 │\n│ Adelie Torgersen-0.6634080.429833-0.420660-1.186793female2007 │\n│ Adelie TorgersennannannannanNULL2007 │\n│ Adelie Torgersen-1.3227991.088129-0.562890-0.937403female2007 │\n│ Adelie Torgersen-0.8465721.746426-0.776236-0.688012male  2007 │\n│ Adelie Torgersen-0.9198370.328556-1.416272-0.719186female2007 │\n│ Adelie Torgersen-0.8648881.240044-0.4206600.590115male  2007 │\n│ Adelie Torgersen-1.7990250.480471-0.562890-0.906229NULL2007 │\n│ Adelie Torgersen-0.3520291.543873-0.7762360.060160NULL2007 │\n│  │\n└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘\n
\n```\n:::\n:::\n\n\n`c` is short for \"column\" and the `~` means \"negate\". Combining those we get \"not the year column\"!\n\nPretty neat right?\n\n### Composable Group By\n\nThe power of this approach comes in when you want the grouped version. Perhaps\nwe think some of these columns vary by species.\n\nWith selectors, all you need to do is slap a `.group_by(\"species\")` onto `t`:\n\n::: {#fe399083 .cell execution_count=4}\n``` {.python .cell-code}\nt.group_by(\"species\").mutate(\n s.across(s.numeric() & ~s.c(\"year\"), (_ - _.mean()) / _.std())\n)\n```\n\n::: {.cell-output .cell-output-display execution_count=4}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓\n┃ species  island     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     year  ┃\n┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩\n│ stringstringfloat64float64float64float64stringint64 │\n├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤\n│ Adelie Torgersen0.791697-1.2709970.160007-0.001444female2008 │\n│ Adelie Biscoe   1.467524-0.0381030.9245960.816322male  2009 │\n│ Adelie Dream    0.3786920.619441-0.9104182.070231male  2007 │\n│ Adelie Dream    -0.860324-0.284681-1.216254-1.200835female2007 │\n│ Adelie Torgersen-0.7852320.7838270.465843-0.546622female2007 │\n│ Adelie Torgersen0.1909621.8523350.007089-0.110480male  2007 │\n│ Adelie Torgersen0.040778-0.449067-1.369172-0.164997female2007 │\n│ Adelie Torgersen0.1534161.0304050.7716782.124749male  2007 │\n│ Adelie Torgersen-1.761426-0.2024890.465843-0.492104NULL2007 │\n│ Adelie Torgersen1.2047021.5235630.0070891.197947NULL2007 │\n│  │\n└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘\n
\n```\n:::\n:::\n\n\nSince ibis translates this into a run-of-the-mill selection as if you had\ncalled `select` or `mutate` without selectors, nothing special is needed for a\nbackend to work with these new constructs.\n\nLet's look at some more examples.\n\n### Min-max Normalization\n\nGrouped min/max normalization? Easy:\n\n::: {#f95a9fbf .cell execution_count=5}\n``` {.python .cell-code}\nt.group_by(\"species\").mutate(\n s.across(s.numeric() & ~s.c(\"year\"), (_ - _.min()) / (_.max() - _.min()))\n)\n```\n\n::: {.cell-output .cell-output-display execution_count=5}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓\n┃ species  island     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     year  ┃\n┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩\n│ stringstringfloat64float64float64float64stringint64 │\n├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤\n│ Adelie Torgersen0.6330940.2166670.5000000.441558female2008 │\n│ Adelie Biscoe   0.7625900.4666670.6315790.636364male  2009 │\n│ Adelie Dream    0.5539570.6000000.3157890.935065male  2007 │\n│ Adelie Dream    0.3165470.4166670.2631580.155844female2007 │\n│ Adelie Torgersen0.3309350.6333330.5526320.311688female2007 │\n│ Adelie Torgersen0.5179860.8500000.4736840.415584male  2007 │\n│ Adelie Torgersen0.4892090.3833330.2368420.402597female2007 │\n│ Adelie Torgersen0.5107910.6833330.6052630.948052male  2007 │\n│ Adelie Torgersen0.1438850.4333330.5526320.324675NULL2007 │\n│ Adelie Torgersen0.7122300.7833330.4736840.727273NULL2007 │\n│  │\n└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘\n
\n```\n:::\n:::\n\n\n### Casting and Munging\n\nHow about casting every column whose name ends with any of the strings `\"mm\"`\nor `\"g\"` to a `float32`? No problem!\n\n::: {#c1d8dafb .cell execution_count=6}\n``` {.python .cell-code}\nt.mutate(s.across(s.endswith((\"mm\", \"g\")), _.cast(\"float32\")))\n```\n\n::: {.cell-output .cell-output-display execution_count=6}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓\n┃ species  island     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     year  ┃\n┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩\n│ stringstringfloat32float32float32float32stringint64 │\n├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤\n│ Adelie Torgersen39.09999818.700001181.03750.0male  2007 │\n│ Adelie Torgersen39.50000017.400000186.03800.0female2007 │\n│ Adelie Torgersen40.29999918.000000195.03250.0female2007 │\n│ Adelie TorgersennannannannanNULL2007 │\n│ Adelie Torgersen36.70000119.299999193.03450.0female2007 │\n│ Adelie Torgersen39.29999920.600000190.03650.0male  2007 │\n│ Adelie Torgersen38.90000217.799999181.03625.0female2007 │\n│ Adelie Torgersen39.20000119.600000195.04675.0male  2007 │\n│ Adelie Torgersen34.09999818.100000193.03475.0NULL2007 │\n│ Adelie Torgersen42.00000020.200001190.04250.0NULL2007 │\n│  │\n└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘\n
\n```\n:::\n:::\n\n\nWe can make all string columns have the same case too!\n\n::: {#bb938f39 .cell execution_count=7}\n``` {.python .cell-code}\nt.mutate(s.across(s.of_type(\"string\"), _.lower()))\n```\n\n::: {.cell-output .cell-output-display execution_count=7}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓\n┃ species  island     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     year  ┃\n┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩\n│ stringstringfloat64float64int64int64stringint64 │\n├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤\n│ adelie torgersen39.118.71813750male  2007 │\n│ adelie torgersen39.517.41863800female2007 │\n│ adelie torgersen40.318.01953250female2007 │\n│ adelie torgersennannanNULLNULLNULL2007 │\n│ adelie torgersen36.719.31933450female2007 │\n│ adelie torgersen39.320.61903650male  2007 │\n│ adelie torgersen38.917.81813625female2007 │\n│ adelie torgersen39.219.61954675male  2007 │\n│ adelie torgersen34.118.11933475NULL2007 │\n│ adelie torgersen42.020.21904250NULL2007 │\n│  │\n└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘\n
\n```\n:::\n:::\n\n\n### Multiple Computations per Column\n\nWhat if I want to compute multiple things? Heck yeah!\n\n::: {#d4c94ff9 .cell execution_count=8}\n``` {.python .cell-code}\nt.group_by(\"sex\").mutate(\n s.across(\n s.numeric() & ~s.c(\"year\"),\n dict(centered=_ - _.mean(), zscore=(_ - _.mean()) / _.std()),\n )\n).select(\"sex\", s.endswith((\"_centered\", \"_zscore\")))\n```\n\n::: {.cell-output .cell-output-display execution_count=8}\n```{=html}\n
┏━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓\n┃ sex     bill_length_mm_centered  bill_depth_mm_centered  flipper_length_mm_centered  body_mass_g_centered  bill_length_mm_zscore  bill_depth_mm_zscore  flipper_length_mm_zscore  body_mass_g_zscore ┃\n┡━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩\n│ stringfloat64float64float64float64float64float64float64float64            │\n├────────┼─────────────────────────┼────────────────────────┼────────────────────────────┼──────────────────────┼───────────────────────┼──────────────────────┼──────────────────────────┼────────────────────┤\n│ female4.10303-1.92545511.636364937.7272730.836760-1.0722700.9308511.407635 │\n│ female1.20303-2.42545510.636364712.7272730.245342-1.3507160.8508561.069885 │\n│ female-8.096970.674545-12.363636-462.272727-1.6512710.375649-0.989030-0.693924 │\n│ female-5.896970.874545-10.363636-562.272727-1.2026100.487027-0.829039-0.844035 │\n│ female-0.996971.174545-15.363636-662.272727-0.2033190.654095-1.229015-0.994147 │\n│ female-5.496971.374545-12.363636-162.272727-1.1210350.765473-0.989030-0.243590 │\n│ female-3.396972.574545-2.363636-412.272727-0.6927681.433743-0.189079-0.618868 │\n│ female-7.696971.974545-13.363636-537.272727-1.5696971.099608-1.069025-0.806507 │\n│ female-4.296971.874545-23.363636-462.272727-0.8763111.043919-1.868975-0.693924 │\n│ female-6.196972.774545-8.363636-62.272727-1.2637911.545122-0.669049-0.093478 │\n│  │\n└────────┴─────────────────────────┴────────────────────────┴────────────────────────────┴──────────────────────┴───────────────────────┴──────────────────────┴──────────────────────────┴────────────────────┘\n
\n```\n:::\n:::\n\n\nDon't like the naming convention?\n\nPass a function to make your own name!\n\n::: {#aeb79ffa .cell execution_count=9}\n``` {.python .cell-code}\nt.select(s.startswith(\"bill\")).mutate(\n s.across(\n s.all(),\n dict(x=_ - _.mean(), y=_.max()),\n names=lambda col, fn: f\"{col}_{fn}_improved\",\n )\n)\n```\n\n::: {.cell-output .cell-output-display execution_count=9}\n```{=html}\n
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n┃ bill_length_mm  bill_depth_mm  bill_length_mm_x_improved  bill_depth_mm_x_improved  bill_length_mm_y_improved  bill_depth_mm_y_improved ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n│ float64float64float64float64float64float64                  │\n├────────────────┼───────────────┼───────────────────────────┼──────────────────────────┼───────────────────────────┼──────────────────────────┤\n│           39.118.7-4.821931.5488359.621.5 │\n│           39.517.4-4.421930.2488359.621.5 │\n│           40.318.0-3.621930.8488359.621.5 │\n│            nannannannan59.621.5 │\n│           36.719.3-7.221932.1488359.621.5 │\n│           39.320.6-4.621933.4488359.621.5 │\n│           38.917.8-5.021930.6488359.621.5 │\n│           39.219.6-4.721932.4488359.621.5 │\n│           34.118.1-9.821930.9488359.621.5 │\n│           42.020.2-1.921933.0488359.621.5 │\n│               │\n└────────────────┴───────────────┴───────────────────────────┴──────────────────────────┴───────────────────────────┴──────────────────────────┘\n
\n```\n:::\n:::\n\n\nDon't like lambda functions? We support a format string too!\n\n::: {#5f3dd2c1 .cell execution_count=10}\n``` {.python .cell-code}\nt.select(s.startswith(\"bill\")).mutate(\n s.across(\n s.all(),\n func=dict(x=_ - _.mean(), y=_.max()),\n names=\"{col}_{fn}_improved\",\n )\n).head(2)\n```\n\n::: {.cell-output .cell-output-display execution_count=10}\n```{=html}\n
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n┃ bill_length_mm  bill_depth_mm  bill_length_mm_x_improved  bill_depth_mm_x_improved  bill_length_mm_y_improved  bill_depth_mm_y_improved ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n│ float64float64float64float64float64float64                  │\n├────────────────┼───────────────┼───────────────────────────┼──────────────────────────┼───────────────────────────┼──────────────────────────┤\n│           39.118.7-4.821931.5488359.621.5 │\n│           39.517.4-4.421930.2488359.621.5 │\n└────────────────┴───────────────┴───────────────────────────┴──────────────────────────┴───────────────────────────┴──────────────────────────┘\n
\n```\n:::\n:::\n\n\n### Working with other Ibis APIs\n\nWe've seen lots of mutate use, but selectors also work with `.agg`:\n\n::: {#6b61a552 .cell execution_count=11}\n``` {.python .cell-code}\nt.group_by(\"year\").agg(s.across(s.numeric() & ~s.c(\"year\"), _.mean())).order_by(\"year\")\n```\n\n::: {.cell-output .cell-output-display execution_count=11}\n```{=html}\n
┏━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ year   bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g ┃\n┡━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ int64float64float64float64float64     │\n├───────┼────────────────┼───────────────┼───────────────────┼─────────────┤\n│  200743.74036717.427523196.8807344124.541284 │\n│  200843.54122816.914035202.7982464266.666667 │\n│  200944.45294117.125210202.8067234210.294118 │\n└───────┴────────────────┴───────────────┴───────────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nNaturally, selectors work in grouping keys too, for even more convenience:\n\n::: {#4d5a0118 .cell execution_count=12}\n``` {.python .cell-code}\nt.group_by(~s.numeric() | s.c(\"year\")).mutate(\n s.across(s.numeric() & ~s.c(\"year\"), dict(centered=_ - _.mean(), std=_.std()))\n).select(\"species\", s.endswith((\"_centered\", \"_std\")))\n```\n\n::: {.cell-output .cell-output-display execution_count=12}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓\n┃ species  bill_length_mm_centered  bill_depth_mm_centered  flipper_length_mm_centered  body_mass_g_centered  bill_length_mm_std  bill_depth_mm_std  flipper_length_mm_std  body_mass_g_std ┃\n┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩\n│ stringfloat64float64float64float64float64float64float64float64         │\n├─────────┼─────────────────────────┼────────────────────────┼────────────────────────────┼──────────────────────┼────────────────────┼───────────────────┼───────────────────────┼─────────────────┤\n│ Adelie -1.460.400000-1.600000-170.0000001.3277800.6819092.302173189.076704 │\n│ Adelie -0.96-0.2000003.400000180.0000001.3277800.6819092.302173189.076704 │\n│ Adelie -0.36-1.100000-1.60000030.0000001.3277800.6819092.302173189.076704 │\n│ Adelie 1.440.3000001.400000-220.0000001.3277800.6819092.302173189.076704 │\n│ Adelie 1.340.600000-1.600000180.0000001.3277800.6819092.302173189.076704 │\n│ Gentoo 1.000.93529411.117647147.0588243.0567550.6707664.973459349.763576 │\n│ Gentoo 1.00-0.164706-0.882353147.0588243.0567550.6707664.973459349.763576 │\n│ Gentoo -1.40-0.864706-3.882353-152.9411763.0567550.6707664.973459349.763576 │\n│ Gentoo -2.30-0.0647060.117647-352.9411763.0567550.6707664.973459349.763576 │\n│ Gentoo -2.200.035294-3.882353-402.9411763.0567550.6707664.973459349.763576 │\n│  │\n└─────────┴─────────────────────────┴────────────────────────┴────────────────────────────┴──────────────────────┴────────────────────┴───────────────────┴───────────────────────┴─────────────────┘\n
\n```\n:::\n:::\n\n\n### Filtering Selectors\n\nYou can also express complex filters more concisely.\n\nLet's say we only want to keep rows where all the bill size z-score related\ncolumns' absolute values are greater than 2.\n\n::: {#c4a578ce .cell execution_count=13}\n``` {.python .cell-code}\nt.drop(\"year\").group_by(\"species\").mutate(\n s.across(s.numeric(), dict(zscore=(_ - _.mean()) / _.std()))\n).filter(s.if_all(s.startswith(\"bill\") & s.endswith(\"_zscore\"), _.abs() > 2))\n```\n\n::: {.cell-output .cell-output-display execution_count=13}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓\n┃ species  island     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     bill_length_mm_zscore  bill_depth_mm_zscore  flipper_length_mm_zscore  body_mass_g_zscore ┃\n┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩\n│ stringstringfloat64float64int64int64stringfloat64float64float64float64            │\n├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────────────────────┼──────────────────────┼──────────────────────────┼────────────────────┤\n│ Adelie Torgersen46.021.51944200male  2.7065392.5920710.6187601.088911 │\n│ Adelie Dream    32.115.51883050female-2.512345-2.339505-0.298747-1.418906 │\n│ Gentoo Biscoe   55.917.02285600male  2.7240462.0565081.6673941.039411 │\n│ Gentoo Biscoe   59.617.02306050male  3.9246212.0565081.9757991.932062 │\n└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────────────────────┴──────────────────────┴──────────────────────────┴────────────────────┘\n
\n```\n:::\n:::\n\n\n### Bonus: Generated SQL\n\nThe SQL for that last expression is pretty gnarly:\n\n::: {#9890792a .cell execution_count=14}\n``` {.python .cell-code}\nibis.to_sql(\n t.drop(\"year\")\n .group_by(\"species\")\n .mutate(s.across(s.numeric(), dict(zscore=(_ - _.mean()) / _.std())))\n .filter(s.if_all(s.startswith(\"bill\") & s.endswith(\"_zscore\"), _.abs() > 2))\n)\n```\n\n::: {.cell-output .cell-output-display execution_count=14}\n```sql\nWITH t0 AS (\n SELECT\n t2.species AS species,\n t2.island AS island,\n t2.bill_length_mm AS bill_length_mm,\n t2.bill_depth_mm AS bill_depth_mm,\n t2.flipper_length_mm AS flipper_length_mm,\n t2.body_mass_g AS body_mass_g,\n t2.sex AS sex\n FROM main._ibis_examples_penguins_mqqdnfaydfbevoowdsf7djbsom AS t2\n), t1 AS (\n SELECT\n t0.species AS species,\n t0.island AS island,\n t0.bill_length_mm AS bill_length_mm,\n t0.bill_depth_mm AS bill_depth_mm,\n t0.flipper_length_mm AS flipper_length_mm,\n t0.body_mass_g AS body_mass_g,\n t0.sex AS sex,\n (\n t0.bill_length_mm - AVG(t0.bill_length_mm) OVER (PARTITION BY t0.species ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)\n ) / STDDEV_SAMP(t0.bill_length_mm) OVER (PARTITION BY t0.species ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS bill_length_mm_zscore,\n (\n t0.bill_depth_mm - AVG(t0.bill_depth_mm) OVER (PARTITION BY t0.species ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)\n ) / STDDEV_SAMP(t0.bill_depth_mm) OVER (PARTITION BY t0.species ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS bill_depth_mm_zscore,\n (\n t0.flipper_length_mm - AVG(t0.flipper_length_mm) OVER (PARTITION BY t0.species ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)\n ) / STDDEV_SAMP(t0.flipper_length_mm) OVER (PARTITION BY t0.species ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS flipper_length_mm_zscore,\n (\n t0.body_mass_g - AVG(t0.body_mass_g) OVER (PARTITION BY t0.species ROWS BETWEEN UNBOUNDED PRECEDING AND 
UNBOUNDED FOLLOWING)\n ) / STDDEV_SAMP(t0.body_mass_g) OVER (PARTITION BY t0.species ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS body_mass_g_zscore\n FROM t0\n)\nSELECT\n t1.species,\n t1.island,\n t1.bill_length_mm,\n t1.bill_depth_mm,\n t1.flipper_length_mm,\n t1.body_mass_g,\n t1.sex,\n t1.bill_length_mm_zscore,\n t1.bill_depth_mm_zscore,\n t1.flipper_length_mm_zscore,\n t1.body_mass_g_zscore\nFROM t1\nWHERE\n ABS(t1.bill_length_mm_zscore) > CAST(2 AS TINYINT)\n AND ABS(t1.bill_depth_mm_zscore) > CAST(2 AS TINYINT)\n```\n:::\n:::\n\n\nGood thing you didn't have to write that by hand!\n\n## Summary\n\nThis blog post illustrates the ability to apply computations to many columns at\nonce and the power of ibis as a composable, expressive library for analytics.\n\n- [Get involved!](../../contribute/index.md)\n- [Report issues!](https://github.com/ibis-project/ibis/issues/new/choose)\n\n", + "engine": "jupyter", + "markdown": "---\ntitle: \"Maximizing productivity with selectors\"\nauthor: Phillip Cloud\ndate: 2023-02-27\ncategories:\n - blog\n - new feature\n - productivity\n - duckdb\n---\n\n\nBefore Ibis 5.0 it's been challenging to concisely express whole-table\noperations with ibis. Happily this is no longer the case in ibis 5.0.\n\nLet's jump right in!\n\nWe'll look at selectors examples using the [`palmerpenguins` data\nset](https://allisonhorst.github.io/palmerpenguins/) with the [DuckDB\nbackend](../../backends/duckdb.qmd).\n\n## Setup\n\n::: {#f1f9d63e .cell execution_count=1}\n``` {.python .cell-code}\nfrom ibis.interactive import *\n\nt = ex.penguins.fetch()\nt\n```\n\n::: {.cell-output .cell-output-display execution_count=1}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓\n┃ species  island     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     year  ┃\n┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩\n│ stringstringfloat64float64int64int64stringint64 │\n├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤\n│ Adelie Torgersen39.118.71813750male  2007 │\n│ Adelie Torgersen39.517.41863800female2007 │\n│ Adelie Torgersen40.318.01953250female2007 │\n│ Adelie TorgersenNULLNULLNULLNULLNULL2007 │\n│ Adelie Torgersen36.719.31933450female2007 │\n│ Adelie Torgersen39.320.61903650male  2007 │\n│ Adelie Torgersen38.917.81813625female2007 │\n│ Adelie Torgersen39.219.61954675male  2007 │\n│ Adelie Torgersen34.118.11933475NULL2007 │\n│ Adelie Torgersen42.020.21904250NULL2007 │\n│  │\n└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘\n
\n```\n:::\n:::\n\n\n## Examples\n\n### Normalization\n\nLet's say you want to compute the\n[z-score](https://en.wikipedia.org/wiki/Standard_score) of every numeric column\nand replace the existing data with that normalized value. Here's how you'd do\nthat with selectors:\n\n::: {#6ab0ac88 .cell execution_count=2}\n``` {.python .cell-code}\nt.mutate(s.across(s.numeric(), (_ - _.mean()) / _.std()))\n```\n\n::: {.cell-output .cell-output-display execution_count=2}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━┓\n┃ species  island     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     year      ┃\n┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━┩\n│ stringstringfloat64float64float64float64stringfloat64   │\n├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────────┤\n│ Adelie Torgersen-0.8832050.784300-1.416272-0.563317male  -1.257484 │\n│ Adelie Torgersen-0.8099390.126003-1.060696-0.500969female-1.257484 │\n│ Adelie Torgersen-0.6634080.429833-0.420660-1.186793female-1.257484 │\n│ Adelie TorgersenNULLNULLNULLNULLNULL-1.257484 │\n│ Adelie Torgersen-1.3227991.088129-0.562890-0.937403female-1.257484 │\n│ Adelie Torgersen-0.8465721.746426-0.776236-0.688012male  -1.257484 │\n│ Adelie Torgersen-0.9198370.328556-1.416272-0.719186female-1.257484 │\n│ Adelie Torgersen-0.8648881.240044-0.4206600.590115male  -1.257484 │\n│ Adelie Torgersen-1.7990250.480471-0.562890-0.906229NULL-1.257484 │\n│ Adelie Torgersen-0.3520291.543873-0.7762360.060160NULL-1.257484 │\n│  │\n└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────────┘\n
\n```\n:::\n:::\n\n\n### What's Up With the `year` Column?\n\nWhoops, looks like we included `year` in our normalization because it's an\n`int64` column (and therefore numeric) but normalizing the year doesn't make\nsense.\n\nWe can exclude `year` from the normalization using another selector:\n\n::: {#c193b746 .cell execution_count=3}\n``` {.python .cell-code}\nt.mutate(s.across(s.numeric() & ~s.cols(\"year\"), (_ - _.mean()) / _.std()))\n```\n\n::: {.cell-output .cell-output-display execution_count=3}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓\n┃ species  island     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     year  ┃\n┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩\n│ stringstringfloat64float64float64float64stringint64 │\n├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤\n│ Adelie Torgersen-0.8832050.784300-1.416272-0.563317male  2007 │\n│ Adelie Torgersen-0.8099390.126003-1.060696-0.500969female2007 │\n│ Adelie Torgersen-0.6634080.429833-0.420660-1.186793female2007 │\n│ Adelie TorgersenNULLNULLNULLNULLNULL2007 │\n│ Adelie Torgersen-1.3227991.088129-0.562890-0.937403female2007 │\n│ Adelie Torgersen-0.8465721.746426-0.776236-0.688012male  2007 │\n│ Adelie Torgersen-0.9198370.328556-1.416272-0.719186female2007 │\n│ Adelie Torgersen-0.8648881.240044-0.4206600.590115male  2007 │\n│ Adelie Torgersen-1.7990250.480471-0.562890-0.906229NULL2007 │\n│ Adelie Torgersen-0.3520291.543873-0.7762360.060160NULL2007 │\n│  │\n└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘\n
\n```\n:::\n:::\n\n\n`cols` selects one or more columns, and the `~` means \"negate\". Combining those\nwe get \"every column except for 'year'\"!\n\nPretty neat right?\n\n### Composable Group By\n\nThe power of this approach comes in when you want the grouped version. Perhaps\nwe think some of these columns vary by species.\n\nWith selectors, all you need to do is slap a `.group_by(\"species\")` onto `t`:\n\n::: {#2e644f2e .cell execution_count=4}\n``` {.python .cell-code}\nt.group_by(\"species\").mutate(\n s.across(s.numeric() & ~s.cols(\"year\"), (_ - _.mean()) / _.std())\n)\n```\n\n::: {.cell-output .cell-output-display execution_count=4}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓\n┃ species  island  bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     year  ┃\n┡━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩\n│ stringstringfloat64float64float64float64stringint64 │\n├─────────┼────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤\n│ Gentoo Biscoe-0.455854-1.816223-0.954050-1.142626female2007 │\n│ Gentoo Biscoe-0.975022-0.287513-0.491442-0.448342female2009 │\n│ Gentoo Biscoe0.387793-0.898997-1.108253-1.241809female2007 │\n│ Gentoo Biscoe0.8096160.2220560.1253681.237778male  2007 │\n│ Gentoo Biscoe0.030865-0.491341-0.3372400.642677male  2007 │\n│ Gentoo Biscoe-0.326062-1.510481-1.108253-1.043442female2007 │\n│ Gentoo Biscoe-0.682990-0.389427-0.954050-0.547525female2007 │\n│ Gentoo Biscoe-0.2611670.3239700.2795710.245943male  2007 │\n│ Gentoo Biscoe-1.364397-1.612395-1.262455-1.340993female2007 │\n│ Gentoo Biscoe-0.2287190.425884-0.3372400.146759male  2007 │\n│  │\n└─────────┴────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘\n
\n```\n:::\n:::\n\n\nSince ibis translates this into a run-of-the-mill selection as if you had\ncalled `select` or `mutate` without selectors, nothing special is needed for a\nbackend to work with these new constructs.\n\nLet's look at some more examples.\n\n### Min-max Normalization\n\nGrouped min/max normalization? Easy:\n\n::: {#2445c896 .cell execution_count=5}\n``` {.python .cell-code}\nt.group_by(\"species\").mutate(\n s.across(s.numeric() & ~s.cols(\"year\"), (_ - _.min()) / (_.max() - _.min()))\n)\n```\n\n::: {.cell-output .cell-output-display execution_count=5}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓\n┃ species  island  bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     year  ┃\n┡━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩\n│ stringstringfloat64float64float64float64stringint64 │\n├─────────┼────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤\n│ Gentoo Biscoe0.2780750.0238100.2857140.234043female2007 │\n│ Gentoo Biscoe0.1925130.3809520.3928570.382979female2009 │\n│ Gentoo Biscoe0.4171120.2380950.2500000.212766female2007 │\n│ Gentoo Biscoe0.4866310.5000000.5357140.744681male  2007 │\n│ Gentoo Biscoe0.3582890.3333330.4285710.617021male  2007 │\n│ Gentoo Biscoe0.2994650.0952380.2500000.255319female2007 │\n│ Gentoo Biscoe0.2406420.3571430.2857140.361702female2007 │\n│ Gentoo Biscoe0.3101600.5238100.5714290.531915male  2007 │\n│ Gentoo Biscoe0.1283420.0714290.2142860.191489female2007 │\n│ Gentoo Biscoe0.3155080.5476190.4285710.510638male  2007 │\n│  │\n└─────────┴────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘\n
\n```\n:::\n:::\n\n\n### Casting and Munging\n\nHow about casting every column whose name ends with any of the strings `\"mm\"`\nor `\"g\"` to a `float32`? No problem!\n\n::: {#6fd97a54 .cell execution_count=6}\n``` {.python .cell-code}\nt.mutate(s.across(s.endswith((\"mm\", \"g\")), _.cast(\"float32\")))\n```\n\n::: {.cell-output .cell-output-display execution_count=6}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓\n┃ species  island     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     year  ┃\n┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩\n│ stringstringfloat32float32float32float32stringint64 │\n├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤\n│ Adelie Torgersen39.09999818.700001181.03750.0male  2007 │\n│ Adelie Torgersen39.50000017.400000186.03800.0female2007 │\n│ Adelie Torgersen40.29999918.000000195.03250.0female2007 │\n│ Adelie TorgersenNULLNULLNULLNULLNULL2007 │\n│ Adelie Torgersen36.70000119.299999193.03450.0female2007 │\n│ Adelie Torgersen39.29999920.600000190.03650.0male  2007 │\n│ Adelie Torgersen38.90000217.799999181.03625.0female2007 │\n│ Adelie Torgersen39.20000119.600000195.04675.0male  2007 │\n│ Adelie Torgersen34.09999818.100000193.03475.0NULL2007 │\n│ Adelie Torgersen42.00000020.200001190.04250.0NULL2007 │\n│  │\n└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘\n
\n```\n:::\n:::\n\n\nWe can make all string columns have the same case too!\n\n::: {#e421d4d9 .cell execution_count=7}\n``` {.python .cell-code}\nt.mutate(s.across(s.of_type(\"string\"), _.lower()))\n```\n\n::: {.cell-output .cell-output-display execution_count=7}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓\n┃ species  island     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     year  ┃\n┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩\n│ stringstringfloat64float64int64int64stringint64 │\n├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤\n│ adelie torgersen39.118.71813750male  2007 │\n│ adelie torgersen39.517.41863800female2007 │\n│ adelie torgersen40.318.01953250female2007 │\n│ adelie torgersenNULLNULLNULLNULLNULL2007 │\n│ adelie torgersen36.719.31933450female2007 │\n│ adelie torgersen39.320.61903650male  2007 │\n│ adelie torgersen38.917.81813625female2007 │\n│ adelie torgersen39.219.61954675male  2007 │\n│ adelie torgersen34.118.11933475NULL2007 │\n│ adelie torgersen42.020.21904250NULL2007 │\n│  │\n└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘\n
\n```\n:::\n:::\n\n\n### Multiple Computations per Column\n\nWhat if I want to compute multiple things? Heck yeah!\n\n::: {#af6d176e .cell execution_count=8}\n``` {.python .cell-code}\nt.group_by(\"sex\").mutate(\n s.across(\n s.numeric() & ~s.cols(\"year\"),\n dict(centered=_ - _.mean(), zscore=(_ - _.mean()) / _.std()),\n )\n).select(\"sex\", s.endswith((\"_centered\", \"_zscore\")))\n```\n\n::: {.cell-output .cell-output-display execution_count=8}\n```{=html}\n
┏━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓\n┃ sex     bill_length_mm_centered  bill_depth_mm_centered  flipper_length_mm_centered  body_mass_g_centered  bill_length_mm_zscore  bill_depth_mm_zscore  flipper_length_mm_zscore  body_mass_g_zscore ┃\n┡━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩\n│ stringfloat64float64float64float64float64float64float64float64            │\n├────────┼─────────────────────────┼────────────────────────┼────────────────────────────┼──────────────────────┼───────────────────────┼──────────────────────┼──────────────────────────┼────────────────────┤\n│ male  0.445238-2.09107110.494048504.3154760.082960-1.1222100.7213460.640296 │\n│ male  2.245238-2.7910714.494048954.3154760.418349-1.4978780.3089141.211631 │\n│ male  -6.2547620.208929-18.505952-95.684524-1.1654340.112125-1.272072-0.121484 │\n│ male  -5.0547621.0089293.494048-245.684524-0.9418410.5414590.240176-0.311929 │\n│ male  -11.2547623.208929-6.505952-145.684524-2.0970711.722128-0.447210-0.184966 │\n│ male  -3.3547622.808929-7.505952-45.684524-0.6250841.507461-0.515948-0.058003 │\n│ male  0.1452383.608929-10.505952-345.6845240.0270621.936795-0.722164-0.438893 │\n│ male  -8.1547620.808929-24.505952-945.684524-1.5194560.434126-1.684504-1.200673 │\n│ male  -7.6547620.208929-19.505952-595.684524-1.4262920.112125-1.340811-0.756301 │\n│ male  -7.054762-0.691071-24.505952-745.684524-1.314496-0.370876-1.684504-0.946746 │\n│  │\n└────────┴─────────────────────────┴────────────────────────┴────────────────────────────┴──────────────────────┴───────────────────────┴──────────────────────┴──────────────────────────┴────────────────────┘\n
\n```\n:::\n:::\n\n\nDon't like the naming convention?\n\nPass a function to make your own name!\n\n::: {#dea5a71f .cell execution_count=9}\n``` {.python .cell-code}\nt.select(s.startswith(\"bill\")).mutate(\n s.across(\n s.all(),\n dict(x=_ - _.mean(), y=_.max()),\n names=lambda col, fn: f\"{col}_{fn}_improved\",\n )\n)\n```\n\n::: {.cell-output .cell-output-display execution_count=9}\n```{=html}\n
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n┃ bill_length_mm  bill_depth_mm  bill_length_mm_x_improved  bill_depth_mm_x_improved  bill_length_mm_y_improved  bill_depth_mm_y_improved ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n│ float64float64float64float64float64float64                  │\n├────────────────┼───────────────┼───────────────────────────┼──────────────────────────┼───────────────────────────┼──────────────────────────┤\n│           39.118.7-4.821931.5488359.621.5 │\n│           39.517.4-4.421930.2488359.621.5 │\n│           40.318.0-3.621930.8488359.621.5 │\n│           NULLNULLNULLNULL59.621.5 │\n│           36.719.3-7.221932.1488359.621.5 │\n│           39.320.6-4.621933.4488359.621.5 │\n│           38.917.8-5.021930.6488359.621.5 │\n│           39.219.6-4.721932.4488359.621.5 │\n│           34.118.1-9.821930.9488359.621.5 │\n│           42.020.2-1.921933.0488359.621.5 │\n│               │\n└────────────────┴───────────────┴───────────────────────────┴──────────────────────────┴───────────────────────────┴──────────────────────────┘\n
\n```\n:::\n:::\n\n\nDon't like lambda functions? We support a format string too!\n\n::: {#123b7382 .cell execution_count=10}\n``` {.python .cell-code}\nt.select(s.startswith(\"bill\")).mutate(\n s.across(\n s.all(),\n func=dict(x=_ - _.mean(), y=_.max()),\n names=\"{col}_{fn}_improved\",\n )\n).head(2)\n```\n\n::: {.cell-output .cell-output-display execution_count=10}\n```{=html}\n
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n┃ bill_length_mm  bill_depth_mm  bill_length_mm_x_improved  bill_depth_mm_x_improved  bill_length_mm_y_improved  bill_depth_mm_y_improved ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n│ float64float64float64float64float64float64                  │\n├────────────────┼───────────────┼───────────────────────────┼──────────────────────────┼───────────────────────────┼──────────────────────────┤\n│           39.118.7-4.821931.5488359.621.5 │\n│           39.517.4-4.421930.2488359.621.5 │\n└────────────────┴───────────────┴───────────────────────────┴──────────────────────────┴───────────────────────────┴──────────────────────────┘\n
\n```\n:::\n:::\n\n\n### Working with other Ibis APIs\n\nWe've seen lots of mutate use, but selectors also work with `.agg`:\n\n::: {#898ed06b .cell execution_count=11}\n``` {.python .cell-code}\nt.group_by(\"year\").agg(s.across(s.numeric() & ~s.cols(\"year\"), _.mean())).order_by(\"year\")\n```\n\n::: {.cell-output .cell-output-display execution_count=11}\n```{=html}\n
┏━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ year   bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g ┃\n┡━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ int64float64float64float64float64     │\n├───────┼────────────────┼───────────────┼───────────────────┼─────────────┤\n│  200743.74036717.427523196.8807344124.541284 │\n│  200843.54122816.914035202.7982464266.666667 │\n│  200944.45294117.125210202.8067234210.294118 │\n└───────┴────────────────┴───────────────┴───────────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nNaturally, selectors work in grouping keys too, for even more convenience:\n\n::: {#27d455be .cell execution_count=12}\n``` {.python .cell-code}\nt.group_by(~s.numeric() | s.cols(\"year\")).mutate(\n s.across(s.numeric() & ~s.cols(\"year\"), dict(centered=_ - _.mean(), std=_.std()))\n).select(\"species\", s.endswith((\"_centered\", \"_std\")))\n```\n\n::: {.cell-output .cell-output-display execution_count=12}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓\n┃ species  bill_length_mm_centered  bill_depth_mm_centered  flipper_length_mm_centered  body_mass_g_centered  bill_length_mm_std  bill_depth_mm_std  flipper_length_mm_std  body_mass_g_std ┃\n┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩\n│ stringfloat64float64float64float64float64float64float64float64         │\n├─────────┼─────────────────────────┼────────────────────────┼────────────────────────────┼──────────────────────┼────────────────────┼───────────────────┼───────────────────────┼─────────────────┤\n│ Adelie -3.18-1.33-10.6-202.51.9274621.0604516.310485372.015905 │\n│ Adelie 0.52-0.53-4.6-202.51.9274621.0604516.310485372.015905 │\n│ Adelie -1.181.677.447.51.9274621.0604516.310485372.015905 │\n│ Adelie -1.580.571.4-152.51.9274621.0604516.310485372.015905 │\n│ Adelie -0.58-0.33-4.6547.51.9274621.0604516.310485372.015905 │\n│ Adelie 0.42-1.036.4-202.51.9274621.0604516.310485372.015905 │\n│ Adelie 3.720.277.4297.51.9274621.0604516.310485372.015905 │\n│ Adelie -0.78-0.631.4497.51.9274621.0604516.310485372.015905 │\n│ Adelie 0.72-0.43-6.6-677.51.9274621.0604516.310485372.015905 │\n│ Adelie 1.921.772.447.51.9274621.0604516.310485372.015905 │\n│  │\n└─────────┴─────────────────────────┴────────────────────────┴────────────────────────────┴──────────────────────┴────────────────────┴───────────────────┴───────────────────────┴─────────────────┘\n
\n```\n:::\n:::\n\n\n### Filtering Selectors\n\nYou can also express complex filters more concisely.\n\nLet's say we only want to keep rows where all the bill size z-score related\ncolumns' absolute values are greater than 2.\n\n::: {#9973769d .cell execution_count=13}\n``` {.python .cell-code}\nt.drop(\"year\").group_by(\"species\").mutate(\n s.across(s.numeric(), dict(zscore=(_ - _.mean()) / _.std()))\n).filter(s.if_all(s.startswith(\"bill\") & s.endswith(\"_zscore\"), _.abs() > 2))\n```\n\n::: {.cell-output .cell-output-display execution_count=13}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓\n┃ species  island     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     bill_length_mm_zscore  bill_depth_mm_zscore  flipper_length_mm_zscore  body_mass_g_zscore ┃\n┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩\n│ stringstringfloat64float64int64int64stringfloat64float64float64float64            │\n├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────────────────────┼──────────────────────┼──────────────────────────┼────────────────────┤\n│ Gentoo Biscoe   59.617.02306050male  3.9246212.0565081.9757991.932062 │\n│ Gentoo Biscoe   55.917.02285600male  2.7240462.0565081.6673941.039411 │\n│ Adelie Torgersen46.021.51944200male  2.7065392.5920710.6187601.088911 │\n│ Adelie Dream    32.115.51883050female-2.512345-2.339505-0.298747-1.418906 │\n└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────────────────────┴──────────────────────┴──────────────────────────┴────────────────────┘\n
\n```\n:::\n:::\n\n\n### Bonus: Generated SQL\n\nThe SQL for that last expression is pretty gnarly:\n\n::: {#45ec3aa9 .cell execution_count=14}\n``` {.python .cell-code}\nibis.to_sql(\n t.drop(\"year\")\n .group_by(\"species\")\n .mutate(s.across(s.numeric(), dict(zscore=(_ - _.mean()) / _.std())))\n .filter(s.if_all(s.startswith(\"bill\") & s.endswith(\"_zscore\"), _.abs() > 2))\n)\n```\n\n::: {.cell-output .cell-output-display .cell-output-markdown execution_count=14}\n```sql\nSELECT\n *\nFROM (\n SELECT\n \"t1\".\"species\",\n \"t1\".\"island\",\n \"t1\".\"bill_length_mm\",\n \"t1\".\"bill_depth_mm\",\n \"t1\".\"flipper_length_mm\",\n \"t1\".\"body_mass_g\",\n \"t1\".\"sex\",\n (\n \"t1\".\"bill_length_mm\" - AVG(\"t1\".\"bill_length_mm\") OVER (PARTITION BY \"t1\".\"species\" ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)\n ) / STDDEV_SAMP(\"t1\".\"bill_length_mm\") OVER (PARTITION BY \"t1\".\"species\" ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS \"bill_length_mm_zscore\",\n (\n \"t1\".\"bill_depth_mm\" - AVG(\"t1\".\"bill_depth_mm\") OVER (PARTITION BY \"t1\".\"species\" ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)\n ) / STDDEV_SAMP(\"t1\".\"bill_depth_mm\") OVER (PARTITION BY \"t1\".\"species\" ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS \"bill_depth_mm_zscore\",\n (\n \"t1\".\"flipper_length_mm\" - AVG(\"t1\".\"flipper_length_mm\") OVER (PARTITION BY \"t1\".\"species\" ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)\n ) / STDDEV_SAMP(\"t1\".\"flipper_length_mm\") OVER (PARTITION BY \"t1\".\"species\" ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS \"flipper_length_mm_zscore\",\n (\n \"t1\".\"body_mass_g\" - AVG(\"t1\".\"body_mass_g\") OVER (PARTITION BY \"t1\".\"species\" ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)\n ) / STDDEV_SAMP(\"t1\".\"body_mass_g\") OVER (PARTITION BY \"t1\".\"species\" ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS 
\"body_mass_g_zscore\"\n FROM (\n SELECT\n \"t0\".*\n EXCLUDE (\"year\")\n FROM \"penguins\" AS \"t0\"\n ) AS \"t1\"\n) AS \"t2\"\nWHERE\n ABS(\"t2\".\"bill_length_mm_zscore\") > 2 AND ABS(\"t2\".\"bill_depth_mm_zscore\") > 2\n```\n:::\n:::\n\n\nGood thing you didn't have to write that by hand!\n\n## Summary\n\nThis blog post illustrates the ability to apply computations to many columns at\nonce and the power of ibis as a composable, expressive library for analytics.\n\n- [Get involved!](../../contribute/index.md)\n- [Report issues!](https://github.com/ibis-project/ibis/issues/new/choose)\n\n", "supporting": [ - "index_files/figure-html" + "index_files" ], "filters": [], "includes": { "include-in-header": [ - "\n\n\n" + "\n\n\n" ] } } diff --git a/docs/_quarto.yml b/docs/_quarto.yml index 61132bc010df..dc82594e7fdc 100644 --- a/docs/_quarto.yml +++ b/docs/_quarto.yml @@ -572,14 +572,15 @@ quartodoc: - matches - any_of - all_of - - c + - cols - across - if_any - if_all - - r + - index - first - last - all + - none - title: Type System desc: "Data types and schemas" diff --git a/docs/how-to/visualization/matplotlib.qmd b/docs/how-to/visualization/matplotlib.qmd index 8fdf5c29b634..7d2e3054e008 100644 --- a/docs/how-to/visualization/matplotlib.qmd +++ b/docs/how-to/visualization/matplotlib.qmd @@ -24,7 +24,7 @@ grouped = t.group_by("species").aggregate(count=ibis._.count()) grouped = grouped.mutate(row_number=ibis.row_number().over()).select( "row_number", ( - ~s.c("row_number") & s.all() + ~s.cols("row_number") & s.all() ), # see https://github.com/ibis-project/ibis/issues/6803 ) grouped diff --git a/docs/posts/ibis-to-file/index.qmd b/docs/posts/ibis-to-file/index.qmd index 5737e0ebf7e2..dfcbc458ebcf 100644 --- a/docs/posts/ibis-to-file/index.qmd +++ b/docs/posts/ibis-to-file/index.qmd @@ -38,7 +38,7 @@ import ibis.selectors as s expr = ( t.group_by("species") - .mutate(s.across(s.numeric() & ~s.c("year"), (_ - _.mean()) / _.std())) + .mutate(s.across(s.numeric() & 
~s.cols("year"), (_ - _.mean()) / _.std())) ) expr ``` diff --git a/docs/posts/selectors/index.qmd b/docs/posts/selectors/index.qmd index e4797da1eb0a..cd134ffc6389 100644 --- a/docs/posts/selectors/index.qmd +++ b/docs/posts/selectors/index.qmd @@ -49,10 +49,11 @@ sense. We can exclude `year` from the normalization using another selector: ```{python} -t.mutate(s.across(s.numeric() & ~s.c("year"), (_ - _.mean()) / _.std())) +t.mutate(s.across(s.numeric() & ~s.cols("year"), (_ - _.mean()) / _.std())) ``` -`c` is short for "column" and the `~` means "negate". Combining those we get "not the year column"! +`cols` selects one or more columns, and the `~` means "negate". Combining those +we get "every column except for 'year'"! Pretty neat right? @@ -65,7 +66,7 @@ With selectors, all you need to do is slap a `.group_by("species")` onto `t`: ```{python} t.group_by("species").mutate( - s.across(s.numeric() & ~s.c("year"), (_ - _.mean()) / _.std()) + s.across(s.numeric() & ~s.cols("year"), (_ - _.mean()) / _.std()) ) ``` @@ -81,7 +82,7 @@ Grouped min/max normalization? Easy: ```{python} t.group_by("species").mutate( - s.across(s.numeric() & ~s.c("year"), (_ - _.min()) / (_.max() - _.min())) + s.across(s.numeric() & ~s.cols("year"), (_ - _.min()) / (_.max() - _.min())) ) ``` @@ -107,7 +108,7 @@ What if I want to compute multiple things? Heck yeah! 
```{python} t.group_by("sex").mutate( s.across( - s.numeric() & ~s.c("year"), + s.numeric() & ~s.cols("year"), dict(centered=_ - _.mean(), zscore=(_ - _.mean()) / _.std()), ) ).select("sex", s.endswith(("_centered", "_zscore"))) @@ -144,14 +145,14 @@ t.select(s.startswith("bill")).mutate( We've seen lots of mutate use, but selectors also work with `.agg`: ```{python} -t.group_by("year").agg(s.across(s.numeric() & ~s.c("year"), _.mean())).order_by("year") +t.group_by("year").agg(s.across(s.numeric() & ~s.cols("year"), _.mean())).order_by("year") ``` Naturally, selectors work in grouping keys too, for even more convenience: ```{python} -t.group_by(~s.numeric() | s.c("year")).mutate( - s.across(s.numeric() & ~s.c("year"), dict(centered=_ - _.mean(), std=_.std())) +t.group_by(~s.numeric() | s.cols("year")).mutate( + s.across(s.numeric() & ~s.cols("year"), dict(centered=_ - _.mean(), std=_.std())) ).select("species", s.endswith(("_centered", "_std"))) ``` diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py index 23f5d83fdd92..3d3adfe586d5 100644 --- a/ibis/backends/tests/test_generic.py +++ b/ibis/backends/tests/test_generic.py @@ -1345,7 +1345,7 @@ def test_memtable_column_naming_mismatch(con, monkeypatch, df, columns): def test_pivot_longer(backend): diamonds = backend.diamonds df = diamonds.execute() - res = diamonds.pivot_longer(s.c("x", "y", "z"), names_to="pos", values_to="xyz") + res = diamonds.pivot_longer(s.cols("x", "y", "z"), names_to="pos", values_to="xyz") assert res.schema().names == ( "carat", "cut", @@ -2469,7 +2469,7 @@ def test_union_generates_predictable_aliases(con): assert len(df) == 2 -@pytest.mark.parametrize("id_cols", [s.none(), [], s.c()]) +@pytest.mark.parametrize("id_cols", [s.none(), [], s.cols()]) def test_pivot_wider_empty_id_columns(con, backend, id_cols, monkeypatch): monkeypatch.setattr(ibis.options, "default_backend", con) data = pd.DataFrame( diff --git a/ibis/backends/tests/tpc/ds/test_queries.py 
b/ibis/backends/tests/tpc/ds/test_queries.py index 540503cd42e3..e6fd2b3272c3 100644 --- a/ibis/backends/tests/tpc/ds/test_queries.py +++ b/ibis/backends/tests/tpc/ds/test_queries.py @@ -1341,7 +1341,7 @@ def test_24(store_sales, store_returns, store, item, customer, customer_address) .group_by(_.c_last_name, _.c_first_name, _.s_store_name) .having(_.netpaid.sum() > ssales.netpaid.mean().as_scalar() * 0.05) .agg(paid=_.netpaid.sum()) - .order_by(~s.c("paid")) + .order_by(~s.cols("paid")) ) @@ -1497,17 +1497,17 @@ def test_28(store_sales): def test_29(store_sales, store_returns, catalog_sales, date_dim, store, item): d1 = ( date_dim.filter(_.d_moy == 9, _.d_year == 1999) - .drop(~s.c("d_date_sk")) + .drop(~s.cols("d_date_sk")) .rename(d1_date_sk="d_date_sk") ) d2 = ( date_dim.filter(_.d_moy.between(9, 9 + 3), _.d_year == 1999) - .drop(~s.c("d_date_sk")) + .drop(~s.cols("d_date_sk")) .rename(d2_date_sk="d_date_sk") ) d3 = ( date_dim.filter(_.d_year.isin((1999, 1999 + 1, 1999 + 2))) - .drop(~s.c("d_date_sk")) + .drop(~s.cols("d_date_sk")) .rename(d3_date_sk="d_date_sk") ) return ( @@ -1864,7 +1864,7 @@ def test_35( .relocate("cd_dep_employed_count", before="cnt2") .relocate("cd_dep_college_count", before="cnt3") .order_by( - s.across(s.startswith("cd_") | s.c("ca_state"), _.asc(nulls_first=True)) + s.across(s.startswith("cd_") | s.cols("ca_state"), _.asc(nulls_first=True)) ) .limit(100) ) @@ -1894,7 +1894,7 @@ def test_36(store_sales, date_dim, item, store): g_category=lit(0), g_class=lit(0), ) - .relocate(s.c("i_category", "i_class"), after="gross_margin") + .relocate(s.cols("i_category", "i_class"), after="gross_margin") ) return ( results.select( @@ -2035,7 +2035,9 @@ def test_39(inventory, item, warehouse, date_dim): ) .order_by( s.across( - s.c("wsk1", "isk1", "dmoy1", "mean1", "cov1", "d_moy", "mean", "cov"), + s.cols( + "wsk1", "isk1", "dmoy1", "mean1", "cov1", "d_moy", "mean", "cov" + ), _.asc(nulls_first=True), ) ) @@ -2169,7 +2171,7 @@ def test_42(date_dim, 
store_sales, item): .join(item.filter(_.i_manager_id == 1), [("ss_item_sk", "i_item_sk")]) .group_by(_.d_year, _.i_category_id, _.i_category) .agg(total_sales=_.ss_ext_sales_price.sum()) - .order_by(_.total_sales.desc(), ~s.c("total_sales")) + .order_by(_.total_sales.desc(), ~s.cols("total_sales")) .limit(100) ) @@ -2268,7 +2270,7 @@ def test_45(web_sales, customer, customer_address, date_dim, item): ) .group_by(_.ca_zip, _.ca_city) .agg(total_web_sales=_.ws_sales_price.sum()) - .order_by(~s.c("total_web_sales")) + .order_by(~s.cols("total_web_sales")) .limit(100) ) @@ -2318,7 +2320,7 @@ def test_46( _.amt, _.profit, ) - .order_by(s.across(~s.c("amt", "profit"), _.asc(nulls_first=True))) + .order_by(s.across(~s.cols("amt", "profit"), _.asc(nulls_first=True))) .limit(100) ) @@ -2346,7 +2348,7 @@ def test_47(item, store_sales, date_dim, store): .mutate( avg_monthly_sales=_.sum_sales.mean().over( # TODO: add support for selectors in window over specification - # group_by=~s.c("sum_sales", "d_moy") + # group_by=~s.cols("sum_sales", "d_moy") group_by=( _.i_category, _.i_brand, @@ -2966,7 +2968,9 @@ def test_57(item, catalog_sales, date_dim, call_center): ) > 0.1, ) - .order_by((_.sum_sales - _.avg_monthly_sales).asc(nulls_first=True), s.r[1:10]) + .order_by( + (_.sum_sales - _.avg_monthly_sales).asc(nulls_first=True), s.index[1:10] + ) .limit(100) ) @@ -4885,7 +4889,7 @@ def test_89(item, store_sales, date_dim, store): .order_by( _.sum_sales - _.avg_monthly_sales, _.s_store_name, - s.r[:9] & ~s.c("s_store_name"), + s.index[:9] & ~s.cols("s_store_name"), ) ).limit(100) diff --git a/ibis/expr/types/relations.py b/ibis/expr/types/relations.py index 56745f33be7c..00f8c5e6ea04 100644 --- a/ibis/expr/types/relations.py +++ b/ibis/expr/types/relations.py @@ -1881,7 +1881,7 @@ def mutate(self, *exprs: Sequence[ir.Expr] | None, **mutations: ir.Value) -> Tab Mutate across multiple columns - >>> t.mutate(s.across(s.numeric() & ~s.c("year"), _ - _.mean())).head() + >>> 
t.mutate(s.across(s.numeric() & ~s.cols("year"), _ - _.mean())).head() ┏━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━━━━━┓ ┃ species ┃ year ┃ bill_length_mm ┃ ┡━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━━━━━┩ @@ -2051,7 +2051,7 @@ def select( Projection with a selector >>> import ibis.selectors as s - >>> t.select(s.numeric() & ~s.c("year")).head() + >>> t.select(s.numeric() & ~s.cols("year")).head() ┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ ┃ bill_length_mm ┃ bill_depth_mm ┃ flipper_length_mm ┃ body_mass_g ┃ ┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ @@ -2067,7 +2067,7 @@ def select( Projection + aggregation across multiple columns >>> from ibis import _ - >>> t.select(s.across(s.numeric() & ~s.c("year"), _.mean())).head() + >>> t.select(s.across(s.numeric() & ~s.cols("year"), _.mean())).head() ┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ ┃ bill_length_mm ┃ bill_depth_mm ┃ flipper_length_mm ┃ body_mass_g ┃ ┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ @@ -2161,7 +2161,7 @@ def rename( >>> import ibis >>> import ibis.selectors as s >>> ibis.options.interactive = True - >>> first3 = s.r[:3] # first 3 columns + >>> first3 = s.index[:3] # first 3 columns >>> t = ibis.examples.penguins_raw_raw.fetch().select(first3) >>> t ┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ @@ -3597,7 +3597,7 @@ def pivot_longer( Here we convert column names not matching the selector for the `religion` column and convert those names into values - >>> relig_income.pivot_longer(~s.c("religion"), names_to="income", values_to="count") + >>> relig_income.pivot_longer(~s.cols("religion"), names_to="income", values_to="count") ┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓ ┃ religion ┃ income ┃ count ┃ ┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩ @@ -3718,7 +3718,7 @@ def pivot_longer( >>> len(who.columns) 60 >>> who.pivot_longer( - ... s.r["new_sp_m014":"newrel_f65"], + ... s.index["new_sp_m014":"newrel_f65"], ... 
names_to=["diagnosis", "gender", "age"], ... names_pattern="new_?(.*)_(.)(.*)", ... values_to="count", @@ -3749,7 +3749,7 @@ def pivot_longer( Let's recode gender and age to numeric values using a mapping >>> who.pivot_longer( - ... s.r["new_sp_m014":"newrel_f65"], + ... s.index["new_sp_m014":"newrel_f65"], ... names_to=["diagnosis", "gender", "age"], ... names_pattern="new_?(.*)_(.)(.*)", ... names_transform=dict( @@ -3784,7 +3784,7 @@ def pivot_longer( The number of match groups in `names_pattern` must match the length of `names_to` >>> who.pivot_longer( # quartodoc: +EXPECTED_FAILURE - ... s.r["new_sp_m014":"newrel_f65"], + ... s.index["new_sp_m014":"newrel_f65"], ... names_to=["diagnosis", "gender", "age"], ... names_pattern="new_?(.*)_.(.*)", ... ) @@ -3795,7 +3795,7 @@ def pivot_longer( `names_transform` must be a mapping or callable >>> who.pivot_longer( - ... s.r["new_sp_m014":"newrel_f65"], names_transform="upper" + ... s.index["new_sp_m014":"newrel_f65"], names_transform="upper" ... ) # quartodoc: +EXPECTED_FAILURE Traceback (most recent call last): ... 
@@ -4429,14 +4429,6 @@ def relocate( ├────────┼────────┼────────┼───────┼───────┼───────┤ │ a │ a │ a │ 1 │ 1 │ 1 │ └────────┴────────┴────────┴───────┴───────┴───────┘ - >>> t.relocate(s.any_of(s.c(*"ae"))) - ┏━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┓ - ┃ a ┃ e ┃ b ┃ c ┃ d ┃ f ┃ - ┡━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━┩ - │ int64 │ string │ int64 │ int64 │ string │ string │ - ├───────┼────────┼───────┼───────┼────────┼────────┤ - │ 1 │ a │ 1 │ 1 │ a │ a │ - └───────┴────────┴───────┴───────┴────────┴────────┘ When multiple columns are selected with `before` or `after`, those selected columns are moved before and after the `selectors` input diff --git a/ibis/selectors.py b/ibis/selectors.py index 14d80ed0f4cb..3e0acf0da4bd 100644 --- a/ibis/selectors.py +++ b/ibis/selectors.py @@ -72,6 +72,20 @@ from ibis.common.typing import VarTuple # noqa: TCH001 +def __getattr__(name): + if name == "c": + util.warn_deprecated( + "c", instead="use `ibis.selectors.cols` instead", as_of="9.5" + ) + return cols + elif name == "r": + util.warn_deprecated( + "r", instead="use `ibis.selectors.index` instead", as_of="9.5" + ) + return index + raise AttributeError(name) + + class Where(Selector): predicate: Callable[[ir.Value], bool] @@ -381,8 +395,27 @@ def expand_names(self, table: ir.Table) -> frozenset[str]: @public -def c(*names: str | ir.Column) -> Selector: - """Select specific column names.""" +def cols(*names: str | ir.Column) -> Selector: + """Select specific column names. 
+ + Parameters + ---------- + names + The column names to select + + Examples + -------- + >>> import ibis + >>> import ibis.selectors as s + >>> t = ibis.table({"a": "int", "b": "int", "c": "int"}) + >>> expr = t.select(s.cols("a", "b")) + >>> expr.columns + ['a', 'b'] + + See Also + -------- + [`index`](#ibis.selectors.index) + """ names = frozenset(col if isinstance(col, str) else col.get_name() for col in names) return Cols(names) @@ -605,7 +638,7 @@ class Slice(Concrete): step: int | None = None -class ColumnSlice(Selector): +class ColumnIndex(Selector): key: str | int | Slice | VarTuple[int | str] @staticmethod @@ -639,15 +672,48 @@ def expand_names(self, table: ir.Table) -> frozenset[str]: return frozenset(iterable) -class Sliceable(Singleton): +class Indexable(Singleton): def __getitem__(self, key: str | int | slice | Iterable[int | str]): if isinstance(key, slice): key = Slice(key.start, key.stop, key.step) - return ColumnSlice(key) + return ColumnIndex(key) + + +index = Indexable() +"""Select columns by index. + +Examples +-------- +>>> import ibis +>>> import ibis.selectors as s +>>> t = ibis.table( +... {"a": "int", "b": "int", "c": "int", "d": "int", "e": "int"} +... 
) +Select one column by numeric index: +>>> expr = t.select(s.index[0]) +>>> expr.columns +['a'] + +Select multiple columns by numeric index: +>>> expr = t.select(s.index[[0, 1]]) +>>> expr.columns +['a', 'b'] + +Select a slice of columns by numeric index: +>>> expr = t.select(s.index[1:4]) +>>> expr.columns +['b', 'c', 'd'] + +Select a slice of columns by name: +>>> expr = t.select(s.index["b":"d"]) +>>> expr.columns +['b', 'c', 'd'] -r = Sliceable() -"""Ranges of columns.""" +See Also +-------- +[`cols`](#ibis.selectors.cols) +""" class First(Singleton, Selector): @@ -713,9 +779,9 @@ def _to_selector( if isinstance(obj, Selector): return obj elif isinstance(obj, ir.Column): - return c(obj.get_name()) + return cols(obj.get_name()) elif isinstance(obj, str): - return c(obj) + return cols(obj) elif isinstance(obj, Expandable): raise exc.IbisInputError( f"Cannot compose {obj.__class__.__name__} with other selectors" diff --git a/ibis/tests/benchmarks/test_benchmarks.py b/ibis/tests/benchmarks/test_benchmarks.py index ee5a76b8d160..d90974c9af96 100644 --- a/ibis/tests/benchmarks/test_benchmarks.py +++ b/ibis/tests/benchmarks/test_benchmarks.py @@ -989,7 +989,7 @@ def test_duckdb_timestamp_conversion(benchmark, con): def test_selectors(benchmark, cols): t = ibis.table(name="t", schema={f"col{i}": "int" for i in range(cols)}) n = cols - cols // 10 - sel = s.across(s.c(*[f"col{i}" for i in range(n)]), lambda c: c.cast("str")) + sel = s.across(s.cols(*[f"col{i}" for i in range(n)]), lambda c: c.cast("str")) benchmark(sel.expand, t) diff --git a/ibis/tests/expr/test_relocate.py b/ibis/tests/expr/test_relocate.py index deddaafc2885..d28e339bfe72 100644 --- a/ibis/tests/expr/test_relocate.py +++ b/ibis/tests/expr/test_relocate.py @@ -27,8 +27,8 @@ def test_duplicates_not_renamed(): def test_keep_non_contiguous_variables(): t = ibis.table(dict.fromkeys("abcde", "int")) - assert t.relocate("b", after=s.c("a", "c", "e")).columns == list("acdeb") - assert t.relocate("e", 
before=s.c("b", "d")).columns == list("aebcd") + assert t.relocate("b", after=s.cols("a", "c", "e")).columns == list("acdeb") + assert t.relocate("e", before=s.cols("b", "d")).columns == list("aebcd") def test_before_after_does_not_move_to_front(): diff --git a/ibis/tests/expr/test_selectors.py b/ibis/tests/expr/test_selectors.py index c254ab3f8886..1761b1d50edd 100644 --- a/ibis/tests/expr/test_selectors.py +++ b/ibis/tests/expr/test_selectors.py @@ -29,6 +29,13 @@ def t(): ) +@pytest.mark.parametrize("name,sol", [("c", s.cols), ("r", s.index)]) +def test_deprecated(name, sol): + with pytest.warns(FutureWarning): + res = getattr(s, name) + assert res is sol + + @pytest.mark.parametrize( "sel", [s.where(lambda _: False), s.startswith("X"), s.endswith("🙂")], @@ -159,13 +166,13 @@ def zscore(c): "expr_func", [ lambda t: t.select( - s.across(s.numeric() & ~s.c("year"), (_ - _.mean()) / _.std()) + s.across(s.numeric() & ~s.cols("year"), (_ - _.mean()) / _.std()) ), - lambda t: t.select(s.across(s.numeric() & ~s.c("year"), zscore)), + lambda t: t.select(s.across(s.numeric() & ~s.cols("year"), zscore)), lambda t: t.select( - s.across(s.numeric() & ~s.c(t.year), (_ - _.mean()) / _.std()) + s.across(s.numeric() & ~s.cols(t.year), (_ - _.mean()) / _.std()) ), - lambda t: t.select(s.across(s.numeric() & ~s.c(t.year), zscore)), + lambda t: t.select(s.across(s.numeric() & ~s.cols(t.year), zscore)), ], ids=["deferred", "func", "deferred-column-ref", "func-column-ref"], ) @@ -184,9 +191,9 @@ def test_across_select(penguins, expr_func): "expr_func", [ lambda t: t.mutate( - s.across(s.numeric() & ~s.c("year"), (_ - _.mean()) / _.std()) + s.across(s.numeric() & ~s.cols("year"), (_ - _.mean()) / _.std()) ), - lambda t: t.mutate(s.across(s.numeric() & ~s.c("year"), zscore)), + lambda t: t.mutate(s.across(s.numeric() & ~s.cols("year"), zscore)), ], ids=["deferred", "func"], ) @@ -204,8 +211,8 @@ def test_across_mutate(penguins, expr_func): @pytest.mark.parametrize( "expr_func", [ - 
lambda t: t.agg(s.across(s.numeric() & ~s.c("year"), _.mean())), - lambda t: t.agg(s.across(s.numeric() & ~s.c("year"), lambda c: c.mean())), + lambda t: t.agg(s.across(s.numeric() & ~s.cols("year"), _.mean())), + lambda t: t.agg(s.across(s.numeric() & ~s.cols("year"), lambda c: c.mean())), ], ids=["deferred", "func"], ) @@ -224,10 +231,10 @@ def test_across_agg(penguins, expr_func): "expr_func", [ lambda t: t.group_by("species").select( - s.across(s.numeric() & ~s.c("year"), (_ - _.mean()) / _.std()) + s.across(s.numeric() & ~s.cols("year"), (_ - _.mean()) / _.std()) ), lambda t: t.group_by("species").select( - s.across(s.numeric() & ~s.c("year"), zscore) + s.across(s.numeric() & ~s.cols("year"), zscore) ), ], ids=["deferred", "func"], @@ -247,10 +254,10 @@ def test_across_group_by_select(penguins, expr_func): "expr_func", [ lambda t: t.group_by("species").mutate( - s.across(s.numeric() & ~s.c("year"), (_ - _.mean()) / _.std()) + s.across(s.numeric() & ~s.cols("year"), (_ - _.mean()) / _.std()) ), lambda t: t.group_by("species").mutate( - s.across(s.numeric() & ~s.c("year"), zscore) + s.across(s.numeric() & ~s.cols("year"), zscore) ), ], ids=["deferred", "func"], @@ -270,10 +277,10 @@ def test_across_group_by_mutate(penguins, expr_func): "expr_func", [ lambda t: t.group_by("species").agg( - s.across(s.numeric() & ~s.c("year"), _.mean()) + s.across(s.numeric() & ~s.cols("year"), _.mean()) ), lambda t: t.group_by("species").agg( - s.across(s.numeric() & ~s.c("year"), lambda c: c.mean()) + s.across(s.numeric() & ~s.cols("year"), lambda c: c.mean()) ), ], ids=["deferred", "func"], @@ -293,10 +300,10 @@ def test_across_group_by_agg(penguins, expr_func): "expr_func", [ lambda t: t.group_by(~s.numeric()).agg( - s.across(s.numeric() & ~s.c("year"), _.mean()) + s.across(s.numeric() & ~s.cols("year"), _.mean()) ), lambda t: t.group_by(~s.numeric()).agg( - s.across(s.numeric() & ~s.c("year"), lambda c: c.mean()) + s.across(s.numeric() & ~s.cols("year"), lambda c: c.mean()) 
), ], ids=["deferred", "func"], @@ -325,7 +332,7 @@ def test_across_str(penguins): def test_if_all(penguins): - expr = penguins.filter(s.if_all(s.numeric() & ~s.c("year"), _ > 5)) + expr = penguins.filter(s.if_all(s.numeric() & ~s.cols("year"), _ > 5)) expected = penguins.filter( (_.bill_length_mm > 5) & (_.bill_depth_mm > 5) @@ -336,7 +343,7 @@ def test_if_all(penguins): def test_if_any(penguins): - expr = penguins.filter(s.if_any(s.numeric() & ~s.c("year"), _ > 5)) + expr = penguins.filter(s.if_any(s.numeric() & ~s.cols("year"), _ > 5)) expected = penguins.filter( (_.bill_length_mm > 5) | (_.bill_depth_mm > 5) @@ -346,24 +353,24 @@ def test_if_any(penguins): assert expr.equals(expected) -def test_negate_range(penguins): - assert penguins.select(~s.r[3:]).equals(penguins[[0, 1, 2]]) +def test_index_negate(penguins): + assert penguins.select(~s.index[3:]).equals(penguins[[0, 1, 2]]) -def test_string_range_start(penguins): - assert penguins.select(s.r["island":5]).equals( +def test_index_slice_string_start(penguins): + assert penguins.select(s.index["island":5]).equals( penguins.select(penguins.columns[penguins.columns.index("island") : 5]) ) -def test_string_range_end(penguins): - assert penguins.select(s.r[:"island"]).equals( +def test_index_slice_string_end(penguins): + assert penguins.select(s.index[:"island"]).equals( penguins.select(penguins.columns[: penguins.columns.index("island") + 1]) ) -def test_string_element(penguins): - assert penguins.select(~s.r["island"]).equals( +def test_index_string(penguins): + assert penguins.select(~s.index["island"]).equals( penguins.select([c for c in penguins.columns if c != "island"]) ) @@ -383,10 +390,12 @@ def test_all(penguins): @pytest.mark.parametrize( ("seq", "expected"), [ - param(~s.r[[3, 4, 5]], sorted(set(range(8)) - {3, 4, 5}), id="neg_int_list"), - param(~s.r[3, 4, 5], sorted(set(range(8)) - {3, 4, 5}), id="neg_int_tuple"), - param(s.r["island", "year"], ("island", "year"), id="string_tuple"), - 
param(s.r[["island", "year"]], ("island", "year"), id="string_list"), + param( + ~s.index[[3, 4, 5]], sorted(set(range(8)) - {3, 4, 5}), id="neg_int_list" + ), + param(~s.index[3, 4, 5], sorted(set(range(8)) - {3, 4, 5}), id="neg_int_tuple"), + param(s.index["island", "year"], ("island", "year"), id="string_tuple"), + param(s.index[["island", "year"]], ("island", "year"), id="string_list"), param(iter(["island", "year"]), ("island", "year"), id="mixed_iterable"), ], ) @@ -397,7 +406,7 @@ def test_sequence(penguins, seq, expected): def test_names_callable(penguins): expr = penguins.select( s.across( - s.numeric() & ~s.c("year"), + s.numeric() & ~s.cols("year"), func=dict(cast=_.cast("float32")), names=lambda col, fn: f"{fn}({col})", ) @@ -416,7 +425,7 @@ def test_names_callable(penguins): def test_names_format_string(penguins): expr = penguins.select( s.across( - s.numeric() & ~s.c("year"), + s.numeric() & ~s.cols("year"), func=dict(cast=_.cast("float32")), names="{fn}({col})", ) @@ -433,7 +442,7 @@ def test_names_format_string(penguins): def test_all_of(penguins): - expr = penguins.select(s.all_of(s.numeric(), ~s.c("year"))) + expr = penguins.select(s.all_of(s.numeric(), ~s.cols("year"))) expected = penguins.select( "bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g" ) @@ -448,7 +457,7 @@ def test_all_of_string_list(penguins): def test_any_of(penguins): - expr = penguins.select(s.any_of(s.startswith("bill"), s.c("year"))) + expr = penguins.select(s.any_of(s.startswith("bill"), s.cols("year"))) expected = penguins.select("bill_length_mm", "bill_depth_mm", "year") assert expr.equals(expected) @@ -461,22 +470,22 @@ def test_any_of_string_list(penguins): assert expr.equals(expected) -def test_c_error_on_misspelled_column(penguins): +def test_cols_error_on_misspelled_column(penguins): match = "Columns .+ are not present" - sel = s.c("inland") + sel = s.cols("inland") with pytest.raises(exc.IbisInputError, match=match): penguins.select(sel) - sel = 
s.any_of(s.c("inland"), s.c("island")) + sel = s.any_of(s.cols("inland"), s.cols("island")) with pytest.raises(exc.IbisInputError, match=match): penguins.select(sel) - sel = s.any_of(s.c("island"), s.c("inland")) + sel = s.any_of(s.cols("island"), s.cols("inland")) with pytest.raises(exc.IbisInputError, match=match): penguins.select(sel) - sel = s.any_of(s.c("island", "inland")) + sel = s.any_of(s.cols("island", "inland")) with pytest.raises(exc.IbisInputError, match=match): penguins.select(sel) @@ -497,19 +506,19 @@ def test_order_by_with_selectors(penguins): def test_window_function_group_by(penguins): - expr = penguins.species.count().over(group_by=s.c("island")) + expr = penguins.species.count().over(group_by=s.cols("island")) assert expr.equals(penguins.species.count().over(group_by=penguins.island)) def test_window_function_order_by(penguins): - expr = penguins.island.count().over(order_by=s.c("species")) + expr = penguins.island.count().over(order_by=s.cols("species")) assert expr.equals(penguins.island.count().over(order_by=penguins.species)) def test_window_function_group_by_order_by(penguins): expr = penguins.species.count().over( - group_by=s.c("island"), - order_by=s.c("year") | (~s.c("island", "species") & s.of_type("str")), + group_by=s.cols("island"), + order_by=s.cols("year") | (~s.cols("island", "species") & s.of_type("str")), ) assert expr.equals( penguins.species.count().over( @@ -531,22 +540,22 @@ def test_methods(penguins): assert [col.get_name() for col in bound] == penguins.columns -@pytest.mark.parametrize("sel", [s.none(), s.c(), []]) +@pytest.mark.parametrize("sel", [s.none(), s.cols(), []]) def test_none_selector(penguins, sel): sel = s._to_selector(sel) assert not sel.expand(penguins) assert not sel.expand_names(penguins) - assert list((sel | s.c("year")).expand_names(penguins)) == ["year"] + assert list((sel | s.cols("year")).expand_names(penguins)) == ["year"] with pytest.raises(exc.IbisError): penguins.select(sel) with 
pytest.raises(exc.IbisError): - penguins.select(sel & s.c("year")) + penguins.select(sel & s.cols("year")) - assert penguins.select(sel | s.c("year")).equals(penguins.select("year")) + assert penguins.select(sel | s.cols("year")).equals(penguins.select("year")) def test_invalid_composition(): diff --git a/ibis/tests/expr/test_table.py b/ibis/tests/expr/test_table.py index 6d802403dca9..50fb132011f0 100644 --- a/ibis/tests/expr/test_table.py +++ b/ibis/tests/expr/test_table.py @@ -1300,7 +1300,7 @@ def test_join_key_invalid(con): t1.inner_join(t2, [("foo_id", "foo_id", "foo_id")]) # it is working now - t1.inner_join(t2, [(s.c("foo_id"), s.c("foo_id"))]) + t1.inner_join(t2, [(s.cols("foo_id"), s.cols("foo_id"))]) def test_join_invalid_refs(con): @@ -1908,7 +1908,7 @@ def test_pivot_longer(): }, name="diamonds", ) - res = diamonds.pivot_longer(s.c("x", "y", "z"), names_to="pos", values_to="xyz") + res = diamonds.pivot_longer(s.cols("x", "y", "z"), names_to="pos", values_to="xyz") assert res.schema().names == ( "carat", "cut",