diff --git a/docs/_freeze/how-to/extending/builtin/execute-results/html.json b/docs/_freeze/how-to/extending/builtin/execute-results/html.json index 36722b951080..bc4ea785f3d5 100644 --- a/docs/_freeze/how-to/extending/builtin/execute-results/html.json +++ b/docs/_freeze/how-to/extending/builtin/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "be8b66093ec8d56b7575bd43ecdab2e9", + "hash": "8e4998b46079b45335d9d6385d97243c", "result": { - "markdown": "---\nfreeze: auto\ntitle: Reference built-in functions\n---\n\n\n\n\n\nFunctions that aren't exposed in ibis directly can be accessed using the\n`@ibis.udf.scalar.builtin` decorator.\n\n::: {.callout-tip}\n## [Ibis APIs](../../reference/index.qmd) may already exist for your function.\n\nBuiltin scalar UDFs are designed to be an escape hatch when Ibis doesn't have\na defined API for a built-in database function.\n\nSee [the reference documentation](../../reference/index.qmd) for existing APIs.\n:::\n\n## DuckDB\n\nIbis doesn't directly expose many of the DuckDB [text similarity\nfunctions](https://duckdb.org/docs/sql/functions/char.html#text-similarity-functions).\nLet's expose the `mismatches` API.\n\n\n::: {#a0ce6764 .cell execution_count=1}\n``` {.python .cell-code}\nfrom ibis import udf\n\n@udf.scalar.builtin\ndef mismatches(left: str, right: str) -> int:\n ...\n```\n:::\n\n\nThe [`...`](https://docs.python.org/3/library/constants.html#Ellipsis) is\na visual indicator that the function definition is unknown to Ibis.\n\n::: {.callout-note collapse=\"true\"}\n## Ibis does not do anything with the function body.\n\nIbis will not inspect the function body or otherwise inspect it. Any code you\nwrite in the function body **will be ignored**.\n:::\n\nWe can now call this function on any ibis expression:\n\n::: {#271b9916 .cell execution_count=2}\n``` {.python .cell-code}\nimport ibis\n\ncon = ibis.duckdb.connect() # <1>\n```\n:::\n\n\n1. 
Connect to an in-memory DuckDB database\n\n::: {#ef527d30 .cell execution_count=3}\n``` {.python .cell-code}\nexpr = mismatches(\"duck\", \"luck\")\ncon.execute(expr)\n```\n\n::: {.cell-output .cell-output-display execution_count=17}\n```\n1\n```\n:::\n:::\n\n\nLike any other ibis expression you can inspect the SQL:\n\n::: {#69b10261 .cell execution_count=4}\n``` {.python .cell-code}\nimport ibis\n\nibis.to_sql(expr, dialect=\"duckdb\") # <1>\n```\n\n::: {.cell-output .cell-output-display execution_count=18}\n```sql\nSELECT\n MISMATCHES('duck', 'luck') AS \"mismatches('duck', 'luck')\"\n```\n:::\n:::\n\n\n1. The `dialect` keyword argument must be passed, because we constructed\n a literal expression which has no backend attached.\n\nBecause built-in UDFs are ultimately Ibis expressions, they compose with the\nrest of the library:\n\n::: {#d44d4edc .cell execution_count=5}\n``` {.python .cell-code}\nibis.options.interactive = True\n\n@udf.scalar.builtin\ndef jaro_winkler_similarity(a: str, b: str) -> float:\n ...\n\npkgs = ibis.read_parquet(\n \"https://storage.googleapis.com/ibis-tutorial-data/pypi/packages.parquet\"\n)\npandas_ish = pkgs[jaro_winkler_similarity(pkgs.name, \"pandas\") >= 0.9]\npandas_ish\n```\n\n::: {.cell-output .cell-output-display execution_count=19}\n```{=html}\n
┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┓\n┃ name      version            requires_python  yanked   has_binary_wheel  has_vulnerabilities  first_uploaded_at    last_uploaded_at     recorded_at          downloads  scorecard_overall  in_google_assured_oss ┃\n┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━┩\n│ stringstringstringbooleanbooleanbooleantimestamptimestamptimestampint32float64boolean               │\n├──────────┼───────────────────┼─────────────────┼─────────┼──────────────────┼─────────────────────┼─────────────────────┼─────────────────────┼─────────────────────┼───────────┼───────────────────┼───────────────────────┤\n│ bcpandas2.4.1            >=3.8.1         │ True    │ False            │ False               │ 2023-07-12 06:14:222023-07-12 06:14:232023-07-12 14:31:410nan │ False                 │\n│ espandas1.0.4            ~               │ False   │ False            │ False               │ 2018-12-22 20:52:302018-12-22 20:52:302023-07-12 14:58:4703.6 │ False                 │\n│ fpandas 0.5              ~               │ False   │ False            │ False               │ 2020-03-09 02:35:312020-03-09 02:35:312023-07-12 15:04:230nan │ False                 │\n│ h3pandas0.2.4            >=3.6           │ False   │ False            │ False               │ 2023-03-19 17:58:162023-03-19 17:58:162023-07-12 15:10:060nan │ False                 │\n│ ipandas 0.0.1            ~               │ False   │ False            │ False               │ 2019-05-29 18:46:122019-05-29 18:46:122023-07-12 15:15:3403.6 │ False                 │\n│ kpandas 0.0.1            >=3.6,<4.0      │ False   │ False            │ False  
             │ 2019-05-02 18:00:292019-05-02 18:00:312023-07-12 15:20:210nan │ False                 │\n│ mpandas 0.0.2.1          ~               │ False   │ False            │ False               │ 2022-07-03 16:21:212022-07-03 16:21:232023-07-12 15:30:350nan │ False                 │\n│ mtpandas1.14.202306141807>=3.6           │ False   │ False            │ False               │ 2023-06-14 18:08:012023-06-14 18:08:012023-07-12 15:31:0404.6 │ False                 │\n│ mypandas0.1.6            >=3.10          │ False   │ False            │ False               │ 2022-10-24 21:01:102022-10-24 21:01:122023-07-12 15:32:040nan │ False                 │\n│ paandas 0.0.3            ~               │ False   │ False            │ False               │ 2022-11-24 06:11:152022-11-24 06:11:172023-07-12 15:43:310nan │ False                 │\n│                      │\n└──────────┴───────────────────┴─────────────────┴─────────┴──────────────────┴─────────────────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┴───────────────────┴───────────────────────┘\n
\n```\n:::\n:::\n\n\nLet's count the results:\n\n::: {#a363610a .cell execution_count=6}\n``` {.python .cell-code}\npandas_ish.count()\n```\n\n::: {.cell-output .cell-output-display}\n```{=html}\n
\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=20}\n\n::: {.ansi-escaped-output}\n```{=html}\n
178
\n```\n:::\n\n:::\n:::\n\n\nThere are a good number of packages that look similar to `pandas`!\n\n## Snowflake\n\nSimilarly we can expose Snowflake's\n[`jarowinkler_similarity`](https://docs.snowflake.com/en/sql-reference/functions/jarowinkler_similarity)\nfunction.\n\nLet's alias it to `jw_sim` to illustrate some more of the Ibis `udf` API:\n\n::: {#c6b88f1d .cell execution_count=7}\n``` {.python .cell-code}\n@udf.scalar.builtin(name=\"jarowinkler_similarity\") # <1>\ndef jw_sim(left: str, right: str) -> float:\n ...\n```\n:::\n\n\n1. `target` is the name of the function in the backend. This argument is\n required in this because the function name is different than the name of the\n function in ibis.\n\n\nNow let's connect to Snowflake and call our `jw_sim` function:\n\n::: {#4b0eeaa8 .cell execution_count=8}\n``` {.python .cell-code}\nimport os\n\ncon = ibis.connect(os.environ[\"SNOWFLAKE_URL\"])\n```\n:::\n\n\n::: {#2c651137 .cell execution_count=9}\n``` {.python .cell-code}\nexpr = jw_sim(\"snow\", \"shoe\")\ncon.execute(expr)\n```\n\n::: {.cell-output .cell-output-display execution_count=23}\n```\n66.0\n```\n:::\n:::\n\n\nAnd let's take a look at the SQL\n\n::: {#d2c051e5 .cell execution_count=10}\n``` {.python .cell-code}\nibis.to_sql(expr, dialect=\"snowflake\")\n```\n\n::: {.cell-output .cell-output-display execution_count=24}\n```sql\nSELECT\n JAROWINKLER_SIMILARITY('snow', 'shoe') AS \"jarowinkler_similarity('snow', 'shoe')\"\n```\n:::\n:::\n\n\n## Input types\n\nSometimes the input types of builtin functions are difficult to spell.\n\nConsider a function that computes the length of any array: the elements in the\narray can be floats, integers, strings and even other arrays. 
Spelling that\ntype is difficult.\n\nFortunately the `udf.scalar.builtin` decorator doesn't require you to specify\ninput types in these cases:\n\n::: {#7f171163 .cell execution_count=11}\n``` {.python .cell-code}\n@udf.scalar.builtin(name=\"array_size\")\ndef cardinality(arr) -> int:\n ...\n```\n:::\n\n\n::: {.callout-caution}\n## The return type annotation **is always required**.\n:::\n\nWe can pass arrays with different element types to our `cardinality` function:\n\n::: {#636298ba .cell execution_count=12}\n``` {.python .cell-code}\ncon.execute(cardinality([1, 2, 3]))\n```\n\n::: {.cell-output .cell-output-display execution_count=26}\n```\n3\n```\n:::\n:::\n\n\n::: {#a15e7c06 .cell execution_count=13}\n``` {.python .cell-code}\ncon.execute(cardinality([\"a\", \"b\"]))\n```\n\n::: {.cell-output .cell-output-display execution_count=27}\n```\n2\n```\n:::\n:::\n\n\nWhen you bypass input types the errors you get back are backend dependent:\n\n::: {#6c4d358c .cell execution_count=14}\n``` {.python .cell-code}\ncon.execute(cardinality(\"foo\"))\n```\n\n::: {.cell-output .cell-output-error}\n```\nProgrammingError: (snowflake.connector.errors.ProgrammingError) 001044 (42P13): SQL compilation error: error line 1 at position 7\nInvalid argument types for function 'ARRAY_SIZE': (VARCHAR(3))\n[SQL: SELECT array_size(%(param_1)s) AS \"array_size('foo')\"]\n[parameters: {'param_1': 'foo'}]\n(Background on this error at: https://sqlalche.me/e/14/f405)\n```\n:::\n:::\n\n\nHere, Snowflake is informing us that the `ARRAY_SIZE` function does not accept\nstrings as input.\n\n", + "markdown": "---\nfreeze: auto\ntitle: Reference built-in functions\n---\n\n\n\n\n\n\n## Scalar functions\n\nFunctions that aren't exposed in ibis directly can be accessed using the\n`@ibis.udf.scalar.builtin` decorator.\n\n::: {.callout-tip}\n### [Ibis APIs](../../reference/index.qmd) may already exist for your function.\n\nBuiltin scalar UDFs are designed to be an escape hatch when Ibis doesn't have\na 
defined API for a built-in database function.\n\nSee [the reference documentation](../../reference/index.qmd) for existing APIs.\n:::\n\n### DuckDB\n\nIbis doesn't directly expose many of the DuckDB [text similarity\nfunctions](https://duckdb.org/docs/sql/functions/char.html#text-similarity-functions).\nLet's expose the `mismatches` API.\n\n\n::: {#5ba1c78c .cell execution_count=1}\n``` {.python .cell-code}\nfrom ibis import udf\n\n@udf.scalar.builtin\ndef mismatches(left: str, right: str) -> int:\n ...\n```\n:::\n\n\nThe [`...`](https://docs.python.org/3/library/constants.html#Ellipsis) is\na visual indicator that the function definition is unknown to Ibis.\n\n::: {.callout-note collapse=\"true\"}\n### Ibis does not do anything with the function body.\n\nIbis will not execute the function body or otherwise inspect it. Any code you\nwrite in the function body **will be ignored**.\n:::\n\nWe can now call this function on any ibis expression:\n\n::: {#61fd038a .cell execution_count=2}\n``` {.python .cell-code}\nimport ibis\n\ncon = ibis.duckdb.connect() # <1>\n```\n:::\n\n\n1. Connect to an in-memory DuckDB database\n\n::: {#1648465b .cell execution_count=3}\n``` {.python .cell-code}\nexpr = mismatches(\"duck\", \"luck\")\ncon.execute(expr)\n```\n\n::: {.cell-output .cell-output-display execution_count=3}\n```\n1\n```\n:::\n:::\n\n\nLike any other ibis expression you can inspect the SQL:\n\n::: {#5a8e63ba .cell execution_count=4}\n``` {.python .cell-code}\nimport ibis\n\nibis.to_sql(expr, dialect=\"duckdb\") # <1>\n```\n\n::: {.cell-output .cell-output-display execution_count=4}\n```sql\nSELECT\n MISMATCHES('duck', 'luck') AS \"mismatches('duck', 'luck')\"\n```\n:::\n:::\n\n\n1. 
The `dialect` keyword argument must be passed, because we constructed\n a literal expression which has no backend attached.\n\nBecause built-in UDFs are ultimately Ibis expressions, they compose with the\nrest of the library:\n\n::: {#b7fa5ac2 .cell execution_count=5}\n``` {.python .cell-code}\nibis.options.interactive = True\n\n@udf.scalar.builtin\ndef jaro_winkler_similarity(a: str, b: str) -> float:\n ...\n\npkgs = ibis.read_parquet(\n \"https://storage.googleapis.com/ibis-tutorial-data/pypi/packages.parquet\"\n)\npandas_ish = pkgs[jaro_winkler_similarity(pkgs.name, \"pandas\") >= 0.9]\npandas_ish\n```\n\n::: {.cell-output .cell-output-display execution_count=5}\n```{=html}\n
┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┓\n┃ name      version            requires_python  yanked   has_binary_wheel  has_vulnerabilities  first_uploaded_at    last_uploaded_at     recorded_at          downloads  scorecard_overall  in_google_assured_oss ┃\n┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━┩\n│ stringstringstringbooleanbooleanbooleantimestamptimestamptimestampint32float64boolean               │\n├──────────┼───────────────────┼─────────────────┼─────────┼──────────────────┼─────────────────────┼─────────────────────┼─────────────────────┼─────────────────────┼───────────┼───────────────────┼───────────────────────┤\n│ bcpandas2.4.1            >=3.8.1         │ True    │ False            │ False               │ 2023-07-12 06:14:222023-07-12 06:14:232023-07-12 14:31:410nan │ False                 │\n│ espandas1.0.4            ~               │ False   │ False            │ False               │ 2018-12-22 20:52:302018-12-22 20:52:302023-07-12 14:58:4703.6 │ False                 │\n│ fpandas 0.5              ~               │ False   │ False            │ False               │ 2020-03-09 02:35:312020-03-09 02:35:312023-07-12 15:04:230nan │ False                 │\n│ h3pandas0.2.4            >=3.6           │ False   │ False            │ False               │ 2023-03-19 17:58:162023-03-19 17:58:162023-07-12 15:10:060nan │ False                 │\n│ ipandas 0.0.1            ~               │ False   │ False            │ False               │ 2019-05-29 18:46:122019-05-29 18:46:122023-07-12 15:15:3403.6 │ False                 │\n│ kpandas 0.0.1            >=3.6,<4.0      │ False   │ False            │ False  
             │ 2019-05-02 18:00:292019-05-02 18:00:312023-07-12 15:20:210nan │ False                 │\n│ mpandas 0.0.2.1          ~               │ False   │ False            │ False               │ 2022-07-03 16:21:212022-07-03 16:21:232023-07-12 15:30:350nan │ False                 │\n│ mtpandas1.14.202306141807>=3.6           │ False   │ False            │ False               │ 2023-06-14 18:08:012023-06-14 18:08:012023-07-12 15:31:0404.6 │ False                 │\n│ mypandas0.1.6            >=3.10          │ False   │ False            │ False               │ 2022-10-24 21:01:102022-10-24 21:01:122023-07-12 15:32:040nan │ False                 │\n│ paandas 0.0.3            ~               │ False   │ False            │ False               │ 2022-11-24 06:11:152022-11-24 06:11:172023-07-12 15:43:310nan │ False                 │\n│                      │\n└──────────┴───────────────────┴─────────────────┴─────────┴──────────────────┴─────────────────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┴───────────────────┴───────────────────────┘\n
\n```\n:::\n:::\n\n\nLet's count the results:\n\n::: {#3d8a0b17 .cell execution_count=6}\n``` {.python .cell-code}\npandas_ish.count()\n```\n\n::: {.cell-output .cell-output-display}\n```{=html}\n
\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=6}\n\n::: {.ansi-escaped-output}\n```{=html}\n
178
\n```\n:::\n\n:::\n:::\n\n\nThere are a good number of packages that look similar to `pandas`!\n\n### Snowflake\n\nSimilarly we can expose Snowflake's\n[`jarowinkler_similarity`](https://docs.snowflake.com/en/sql-reference/functions/jarowinkler_similarity)\nfunction.\n\nLet's alias it to `jw_sim` to illustrate some more of the Ibis `udf` API:\n\n::: {#6628765c .cell execution_count=7}\n``` {.python .cell-code}\n@udf.scalar.builtin(name=\"jarowinkler_similarity\") # <1>\ndef jw_sim(left: str, right: str) -> float:\n ...\n```\n:::\n\n\n1. `target` is the name of the function in the backend. This argument is\n required in this because the function name is different than the name of the\n function in ibis.\n\n\nNow let's connect to Snowflake and call our `jw_sim` function:\n\n::: {#32fda4ee .cell execution_count=8}\n``` {.python .cell-code}\nimport os\n\ncon = ibis.connect(os.environ[\"SNOWFLAKE_URL\"])\n```\n:::\n\n\n::: {#265d6145 .cell execution_count=9}\n``` {.python .cell-code}\nexpr = jw_sim(\"snow\", \"shoe\")\ncon.execute(expr)\n```\n\n::: {.cell-output .cell-output-display execution_count=9}\n```\n66.0\n```\n:::\n:::\n\n\nAnd let's take a look at the SQL\n\n::: {#301ff3f1 .cell execution_count=10}\n``` {.python .cell-code}\nibis.to_sql(expr, dialect=\"snowflake\")\n```\n\n::: {.cell-output .cell-output-display execution_count=10}\n```sql\nSELECT\n JAROWINKLER_SIMILARITY('snow', 'shoe') AS \"jw_sim('snow', 'shoe')\"\n```\n:::\n:::\n\n\n### Input types\n\nSometimes the input types of builtin functions are difficult to spell.\n\nConsider a function that computes the length of any array: the elements in the\narray can be floats, integers, strings and even other arrays. 
Spelling that\ntype is difficult.\n\nFortunately the `udf.scalar.builtin` decorator doesn't require you to specify\ninput types in these cases:\n\n::: {#e66eeb5f .cell execution_count=11}\n``` {.python .cell-code}\n@udf.scalar.builtin(name=\"array_size\")\ndef cardinality(arr) -> int:\n ...\n```\n:::\n\n\n::: {.callout-caution}\n## The return type annotation **is always required**.\n:::\n\nWe can pass arrays with different element types to our `cardinality` function:\n\n::: {#e356c5c3 .cell execution_count=12}\n``` {.python .cell-code}\ncon.execute(cardinality([1, 2, 3]))\n```\n\n::: {.cell-output .cell-output-display execution_count=12}\n```\n3\n```\n:::\n:::\n\n\n::: {#726cb281 .cell execution_count=13}\n``` {.python .cell-code}\ncon.execute(cardinality([\"a\", \"b\"]))\n```\n\n::: {.cell-output .cell-output-display execution_count=13}\n```\n2\n```\n:::\n:::\n\n\nWhen you bypass input types the errors you get back are backend dependent:\n\n::: {#31485da0 .cell execution_count=14}\n``` {.python .cell-code}\ncon.execute(cardinality(\"foo\"))\n```\n\n::: {.cell-output .cell-output-error}\n```\nProgrammingError: (snowflake.connector.errors.ProgrammingError) 001044 (42P13): SQL compilation error: error line 1 at position 7\nInvalid argument types for function 'ARRAY_SIZE': (VARCHAR(3))\n[SQL: SELECT array_size(%(param_1)s) AS \"cardinality('foo')\"]\n[parameters: {'param_1': 'foo'}]\n(Background on this error at: https://sqlalche.me/e/14/f405)\n```\n:::\n:::\n\n\nHere, Snowflake is informing us that the `ARRAY_SIZE` function does not accept\nstrings as input.\n\n\n## Aggregate functions\n\nAggregate functions that aren't exposed in ibis directly can be accessed using\nthe `@ibis.udf.agg.builtin` decorator.\n\n::: {.callout-tip}\n### [Ibis APIs](../../reference/index.qmd) may already exist for your function.\n\nBuiltin aggregate UDFs are designed to be an escape hatch when Ibis doesn't have\na defined API for a built-in database function.\n\nSee [the reference 
documentation](../../reference/index.qmd) for existing APIs.\n:::\n\nLet's the use the DuckDB backend to demonstrate how to access an aggregate\nfunction that isn't exposed in ibis:\n[`kurtosis`](https://en.wikipedia.org/wiki/Kurtosis).\n\n### DuckDB\n\nFirst, define the builtin aggregate function:\n\n::: {#2ccc8b6a .cell execution_count=15}\n``` {.python .cell-code}\n@udf.agg.builtin\ndef kurtosis(x: float) -> float: # <1>\n ...\n```\n:::\n\n\n1. Both the input and return type annotations indicate the **element** type of\n the input, not the shape (column or scalar). Aggregations can only be called\n on column expressions.\n\nOne of the powerful features of this API is that you can define your UD(A)Fs at\nany point during your analysis. You don't need to connect to the database to\ndefine your functions.\n\nLet's compute the kurtosis of the number of votes across all movies:\n\n::: {#7aa0b023 .cell execution_count=16}\n``` {.python .cell-code}\nfrom ibis import _\n\nexpr = (\n ibis.examples.imdb_title_ratings.fetch()\n .rename(\"snake_case\")\n .agg(kurt=lambda t: kurtosis(t.num_votes))\n)\nexpr\n```\n\n::: {.cell-output .cell-output-display execution_count=16}\n```{=html}\n
┏━━━━━━━━━━━━━┓\n┃ kurt        ┃\n┡━━━━━━━━━━━━━┩\n│ float64     │\n├─────────────┤\n│ 4545.349906 │\n└─────────────┘\n
\n```\n:::\n:::\n\n\nSince this is an aggregate function, it has the same capabilities as other,\nbuiltin aggregates like `sum`: it can be used in a group by as well as in\na window function expression.\n\nLet's compute kurtosis for all the different types of productions (shorts,\nmovies, TV, etc):\n\n::: {#ae226b1e .cell execution_count=17}\n``` {.python .cell-code}\nbasics = (\n ibis.examples.imdb_title_basics.fetch()\n .rename(\"snake_case\")\n .filter(_.is_adult == 0)\n)\nratings = ibis.examples.imdb_title_ratings.fetch().rename(\"snake_case\")\n\nbasics_ratings = ratings.join(basics, \"tconst\")\n\nexpr = (\n basics_ratings.group_by(\"title_type\")\n .agg(kurt=lambda t: kurtosis(t.num_votes))\n .order_by(_.kurt.desc())\n .head()\n)\nexpr\n```\n\n::: {.cell-output .cell-output-display execution_count=17}\n```{=html}\n
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ title_type    kurt        ┃\n┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ stringfloat64     │\n├──────────────┼─────────────┤\n│ tvEpisode   8043.838209 │\n│ tvSeries    4030.938238 │\n│ short       3645.730119 │\n│ tvMiniSeries1901.614316 │\n│ tvMovie     1316.403908 │\n└──────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nSimilarly for window functions:\n\n::: {#8a9b28ba .cell execution_count=18}\n``` {.python .cell-code}\nexpr = (\n basics_ratings.mutate(\n kurt=lambda t: kurtosis(t.num_votes).over(group_by=\"title_type\")\n )\n .relocate(\"kurt\", after=\"tconst\")\n .filter(\n [\n _.original_title.lower().contains(\"godfather\"),\n _.title_type == \"movie\",\n _.genres.contains(\"Crime\") & _.genres.contains(\"Drama\"),\n ]\n )\n)\nexpr\n```\n\n::: {.cell-output .cell-output-display execution_count=18}\n```{=html}\n
┏━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓\n┃ tconst      kurt         average_rating  num_votes  title_type  primary_title           original_title          is_adult  start_year  end_year  runtime_minutes  genres             ┃\n┡━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩\n│ stringfloat64float64int64stringstringstringint64int64stringint64string             │\n├────────────┼─────────────┼────────────────┼───────────┼────────────┼────────────────────────┼────────────────────────┼──────────┼────────────┼──────────┼─────────────────┼────────────────────┤\n│ tt131303081090.3638565.27303movie     Godfather             Godfather             02022NULL157Action,Crime,Drama │\n│ tt0458027 1090.3638563.727movie     Mumbai Godfather      Mumbai Godfather      02005NULLNULLAction,Crime,Drama │\n│ tt0068646 1090.3638569.21945537movie     The Godfather         The Godfather         01972NULL175Crime,Drama        │\n│ tt0071562 1090.3638569.01321642movie     The Godfather Part II The Godfather Part II 01974NULL202Crime,Drama        │\n│ tt0074412 1090.3638565.21733movie     Disco Godfather       Disco Godfather       01979NULL98Action,Crime,Drama │\n│ tt0099674 1090.3638567.6412936movie     The Godfather Part IIIThe Godfather Part III01990NULL162Crime,Drama        │\n│ tt0250404 1090.3638566.5244movie     Godfather             Godfather             01992NULLNULLCrime,Drama        │\n└────────────┴─────────────┴────────────────┴───────────┴────────────┴────────────────────────┴────────────────────────┴──────────┴────────────┴──────────┴─────────────────┴────────────────────┘\n
\n```\n:::\n:::\n\n\n", "supporting": [ "builtin_files" ], diff --git a/docs/how-to/extending/builtin.qmd b/docs/how-to/extending/builtin.qmd index 141c2b9d16dd..a8a3a2091b87 100644 --- a/docs/how-to/extending/builtin.qmd +++ b/docs/how-to/extending/builtin.qmd @@ -4,11 +4,14 @@ freeze: auto # Reference built-in functions + +## Scalar functions + Functions that aren't exposed in ibis directly can be accessed using the `@ibis.udf.scalar.builtin` decorator. ::: {.callout-tip} -## [Ibis APIs](../../reference/index.qmd) may already exist for your function. +### [Ibis APIs](../../reference/index.qmd) may already exist for your function. Builtin scalar UDFs are designed to be an escape hatch when Ibis doesn't have a defined API for a built-in database function. @@ -16,7 +19,7 @@ a defined API for a built-in database function. See [the reference documentation](../../reference/index.qmd) for existing APIs. ::: -## DuckDB +### DuckDB Ibis doesn't directly expose many of the DuckDB [text similarity functions](https://duckdb.org/docs/sql/functions/char.html#text-similarity-functions). @@ -34,9 +37,9 @@ The [`...`](https://docs.python.org/3/library/constants.html#Ellipsis) is a visual indicator that the function definition is unknown to Ibis. ::: {.callout-note collapse="true"} -## Ibis does not do anything with the function body. +### Ibis does not do anything with the function body. -Ibis will not inspect the function body or otherwise inspect it. Any code you +Ibis will not execute the function body or otherwise inspect it. Any code you write in the function body **will be ignored**. ::: @@ -91,7 +94,7 @@ pandas_ish.count() There are a good number of packages that look similar to `pandas`! 
-## Snowflake +### Snowflake Similarly we can expose Snowflake's [`jarowinkler_similarity`](https://docs.snowflake.com/en/sql-reference/functions/jarowinkler_similarity) @@ -129,7 +132,7 @@ And let's take a look at the SQL ibis.to_sql(expr, dialect="snowflake") ``` -## Input types +### Input types Sometimes the input types of builtin functions are difficult to spell. @@ -169,3 +172,98 @@ con.execute(cardinality("foo")) Here, Snowflake is informing us that the `ARRAY_SIZE` function does not accept strings as input. + + +## Aggregate functions + +Aggregate functions that aren't exposed in ibis directly can be accessed using +the `@ibis.udf.agg.builtin` decorator. + +::: {.callout-tip} +### [Ibis APIs](../../reference/index.qmd) may already exist for your function. + +Builtin aggregate UDFs are designed to be an escape hatch when Ibis doesn't have +a defined API for a built-in database function. + +See [the reference documentation](../../reference/index.qmd) for existing APIs. +::: + +Let's use the DuckDB backend to demonstrate how to access an aggregate +function that isn't exposed in ibis: +[`kurtosis`](https://en.wikipedia.org/wiki/Kurtosis). + +### DuckDB + +First, define the builtin aggregate function: + +```{python} +@udf.agg.builtin +def kurtosis(x: float) -> float: # <1> + ... +``` + +1. Both the input and return type annotations indicate the **element** type of + the input, not the shape (column or scalar). Aggregations can only be called + on column expressions. + +One of the powerful features of this API is that you can define your UD(A)Fs at +any point during your analysis. You don't need to connect to the database to +define your functions. 
+ +Let's compute the kurtosis of the number of votes across all movies: + +```{python} +from ibis import _ + +expr = ( + ibis.examples.imdb_title_ratings.fetch() + .rename("snake_case") + .agg(kurt=lambda t: kurtosis(t.num_votes)) +) +expr +``` + +Since this is an aggregate function, it has the same capabilities as other, +builtin aggregates like `sum`: it can be used in a group by as well as in +a window function expression. + +Let's compute kurtosis for all the different types of productions (shorts, +movies, TV, etc): + +```{python} +basics = ( + ibis.examples.imdb_title_basics.fetch() + .rename("snake_case") + .filter(_.is_adult == 0) +) +ratings = ibis.examples.imdb_title_ratings.fetch().rename("snake_case") + +basics_ratings = ratings.join(basics, "tconst") + +expr = ( + basics_ratings.group_by("title_type") + .agg(kurt=lambda t: kurtosis(t.num_votes)) + .order_by(_.kurt.desc()) + .head() +) +expr +``` + +Similarly for window functions: + +```{python} +expr = ( + basics_ratings.mutate( + kurt=lambda t: kurtosis(t.num_votes).over(group_by="title_type") + ) + .relocate("kurt", after="tconst") + .filter( + [ + _.original_title.lower().contains("godfather"), + _.title_type == "movie", + _.genres.contains("Crime") & _.genres.contains("Drama"), + ] + ) +) +expr +``` diff --git a/ibis/backends/base/sql/__init__.py b/ibis/backends/base/sql/__init__.py index f294de2b39fc..dd7c8cb41059 100644 --- a/ibis/backends/base/sql/__init__.py +++ b/ibis/backends/base/sql/__init__.py @@ -255,6 +255,23 @@ def _(t, op): func = ".".join(filter(None, (op.__udf_namespace__, op.__func_name__))) return f"{func}({', '.join(map(t.translate, op.args))})" + def _gen_udaf_rule(self, op: ops.AggUDF): + from ibis import NA + + @self.add_operation(type(op)) + def _(t, op): + func = ".".join(filter(None, (op.__udf_namespace__, op.__func_name__))) + args = ", ".join( + t.translate( + ops.Where(where, arg, NA) + if (where := op.where) is not None + else arg + ) + for name, arg in zip(op.argnames, 
op.args) + if name != "where" + ) + return f"{func}({args})" + def _define_udf_translation_rules(self, expr): for udf_node in expr.op().find(ops.ScalarUDF): udf_node_type = type(udf_node) @@ -262,6 +279,12 @@ def _define_udf_translation_rules(self, expr): if udf_node_type not in self.compiler.translator_class._registry: self._gen_udf_rule(udf_node) + for udf_node in expr.op().find(ops.AggUDF): + udf_node_type = type(udf_node) + + if udf_node_type not in self.compiler.translator_class._registry: + self._gen_udaf_rule(udf_node) + def execute( self, expr: ir.Expr, diff --git a/ibis/backends/bigquery/tests/system/udf/test_udf_execute.py b/ibis/backends/bigquery/tests/system/udf/test_udf_execute.py index 1473003526c7..3020a02b58c4 100644 --- a/ibis/backends/bigquery/tests/system/udf/test_udf_execute.py +++ b/ibis/backends/bigquery/tests/system/udf/test_udf_execute.py @@ -187,7 +187,7 @@ def test_udf_sql(con, argument_type): param(b"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff", 80, id="eighty"), ], ) -def test_builtin(con, value, expected): +def test_builtin_scalar(con, value, expected): from ibis import udf @udf.scalar.builtin @@ -197,3 +197,24 @@ def bit_count(x: bytes) -> int: expr = bit_count(value) result = con.execute(expr) assert result == expected + + +@pytest.mark.parametrize( + ("where", "expected"), + [ + param({"where": True}, list("abcdef"), id="where-true"), + param({"where": False}, [], id="where-false"), + param({}, list("abcdef"), id="where-nothing"), + ], +) +def test_builtin_agg(con, where, expected): + from ibis import udf + + @udf.agg.builtin(name="array_concat_agg") + def concat_agg(x, where: bool = True) -> dt.Array[str]: + ... 
+ + t = ibis.memtable({"a": [list("abc"), list("def")]}) + expr = concat_agg(t.a, **where) + result = con.execute(expr) + assert result == expected diff --git a/ibis/backends/postgres/__init__.py b/ibis/backends/postgres/__init__.py index f3ff1ab375f9..8c23fbba8893 100644 --- a/ibis/backends/postgres/__init__.py +++ b/ibis/backends/postgres/__init__.py @@ -209,15 +209,12 @@ def _get_udf_source(self, udf_node: ops.ScalarUDF): config = udf_node.__config__["kwargs"] func = udf_node.__func__ func_name = func.__name__ - schema = udf_node.__udf_namespace__ - name = udf_node.__func_name__ - ident = ".".join(filter(None, [schema, name])) return dict( - name=name, - ident=ident, + name=udf_node.__func_name__, + ident=udf_node.__full_name__, signature=", ".join( - f"{name} {self._compile_type(arg.dtype)}" - for name, arg in zip(udf_node.argnames, udf_node.args) + f"{argname} {self._compile_type(arg.dtype)}" + for argname, arg in zip(udf_node.argnames, udf_node.args) ), return_type=self._compile_type(udf_node.dtype), language=config.get("language", "plpython3u"), diff --git a/ibis/expr/operations/udf.py b/ibis/expr/operations/udf.py index 6cf7c33f9594..461ffe318ac6 100644 --- a/ibis/expr/operations/udf.py +++ b/ibis/expr/operations/udf.py @@ -1,10 +1,11 @@ from __future__ import annotations +import abc import enum import functools import inspect import typing -from typing import TYPE_CHECKING, Any, Callable, TypeVar +from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar from public import public @@ -36,6 +37,11 @@ class ScalarUDF(ops.Value): shape = rlz.shape_like("args") +@public +class AggUDF(ops.Reduction): + where: Optional[ops.Value[dt.Boolean]] = None + + def _wrap( wrapper, input_type: InputType, @@ -51,11 +57,117 @@ def _wrap( return functools.update_wrapper(wrapper(input_type, fn, *args, **kwargs), fn) -S = TypeVar("S", bound=ScalarUDF) +S = TypeVar("S", bound=ops.Value) +B = TypeVar("B", bound=ops.Value) + + +class _UDF(abc.ABC): + __slots__ = () + + 
@property + @abc.abstractmethod + def _base(self) -> type[B]: + """Base class of the UDF.""" + + @util.experimental + @classmethod + def builtin( + cls, + fn: Callable | None = None, + *args: Any, + name: str | None = None, + schema: str | None = None, + **kwargs: Any, + ) -> Callable: + """Construct a scalar user-defined function that is built-in to the backend. + + Parameters + ---------- + fn + The function to wrap. + args + Configuration arguments for the UDF. + name + The name of the UDF in the backend if different from the function name. + schema + The schema in which the builtin function resides. + kwargs + Additional configuration arguments for the UDF. + + Examples + -------- + >>> import ibis + >>> @ibis.udf.scalar.builtin + ... def hamming(a: str, b: str) -> int: + ... '''Compute the Hamming distance between two strings.''' + >>> expr = hamming("duck", "luck") + >>> con = ibis.connect("duckdb://") + >>> con.execute(expr) + 1 + """ + return _wrap( + cls._make_wrapper, + InputType.BUILTIN, + fn, + *args, + name=name, + schema=schema, + **kwargs, + ) + + @classmethod + def _make_node( + cls, + fn: Callable, + input_type: InputType, + *args, + name: str | None = None, + schema: str | None = None, + **kwargs, + ) -> type[S]: + """Construct a scalar user-defined function that is built-in to the backend.""" + + annotations = typing.get_type_hints(fn) + if (return_annotation := annotations.pop("return", None)) is None: + raise exc.MissingReturnAnnotationError(fn) + + func_name = name if name is not None else fn.__name__ + + fields = { + arg_name: Argument( + pattern=rlz.ValueOf(annotations.get(arg_name)), default=param.default + ) + for arg_name, param in inspect.signature(fn).parameters.items() + } | { + "dtype": dt.dtype(return_annotation), + "__input_type__": input_type, + # must wrap `fn` in a `property` otherwise `fn` is assumed to be a + # method + "__func__": property(fget=lambda _, fn=fn: fn), + "__config__": FrozenDict(args=args, 
kwargs=FrozenDict(**kwargs)), + "__udf_namespace__": schema, + "__module__": fn.__module__, + "__func_name__": func_name, + "__full_name__": ".".join(filter(None, (schema, func_name))), + } + + return type(fn.__name__, (cls._base,), fields) + + @classmethod + def _make_wrapper( + cls, input_type: InputType, fn: Callable, *args: Any, **kwargs: Any + ) -> Callable: + node = cls._make_node(fn, input_type, *args, **kwargs) + + @functools.wraps(fn) + def construct(*args: Any, **kwargs: Any) -> ir.Value: + return node(*args, **kwargs).to_expr() + + return construct @public -class scalar: +class scalar(_UDF): """Scalar user-defined functions. ::: {.callout-note} @@ -63,9 +175,12 @@ class scalar: ::: """ + _base = ScalarUDF + @util.experimental - @staticmethod + @classmethod def python( + cls, fn: Callable | None = None, *args: Any, name: str | None = None, @@ -117,7 +232,7 @@ def python( - [`pyarrow`](./scalar-udfs.qmd#ibis.expr.operations.scalar.pyarrow) """ return _wrap( - scalar._make_wrapper, + cls._make_wrapper, InputType.PYTHON, fn, *args, @@ -127,8 +242,9 @@ def python( ) @util.experimental - @staticmethod + @classmethod def pandas( + cls, fn: Callable | None = None, *args: Any, name: str | None = None, @@ -169,7 +285,7 @@ def pandas( - [`pyarrow`](./scalar-udfs.qmd#ibis.expr.operations.scalar.pyarrow) """ return _wrap( - scalar._make_wrapper, + cls._make_wrapper, InputType.PANDAS, fn, *args, @@ -179,8 +295,9 @@ def pandas( ) @util.experimental - @staticmethod + @classmethod def pyarrow( + cls, fn: Callable | None = None, *args: Any, name: str | None = None, @@ -220,7 +337,7 @@ def pyarrow( - [`pandas`](./scalar-udfs.qmd#ibis.expr.operations.scalar.pandas) """ return _wrap( - scalar._make_wrapper, + cls._make_wrapper, InputType.PYARROW, fn, *args, @@ -229,16 +346,16 @@ def pyarrow( **kwargs, ) + +class agg(_UDF): + __slots__ = () + + _base = AggUDF + @util.experimental - @staticmethod - def builtin( - fn: Callable | None = None, - *args: Any, - name: str | None = 
None, - schema: str | None = None, - **kwargs: Any, - ) -> Callable: - """Construct a scalar user-defined function that is built-in to the backend. + @classmethod + def builtin(cls, *args: Any, **kwargs: Any) -> Callable: + """Construct an aggregate user-defined function that is built-in to the backend. Parameters ---------- @@ -256,68 +373,13 @@ def builtin( Examples -------- >>> import ibis - >>> @ibis.udf.scalar.builtin - ... def hamming(a: str, b: str) -> int: - ... '''Compute the Hamming distance between two strings.''' - >>> expr = hamming("duck", "luck") - >>> con = ibis.connect("duckdb://") - >>> con.execute(expr) - 1 + >>> ibis.options.interactive = True + >>> @ibis.udf.agg.builtin + ... def favg(a: float) -> float: + ... '''Compute the average of a column using Kahan summation.''' + >>> t = ibis.examples.penguins.fetch() + >>> expr = favg(t.bill_length_mm) + >>> expr + 43.9219298245614 """ - return _wrap( - scalar._make_wrapper, - InputType.BUILTIN, - fn, - *args, - name=name, - schema=schema, - **kwargs, - ) - - @staticmethod - def _make_node( - fn: Callable, - input_type: InputType, - *args, - name: str | None = None, - schema: str | None = None, - **kwargs, - ) -> type[S]: - """Construct a scalar user-defined function that is built-in to the backend.""" - - annotations = typing.get_type_hints(fn) - if (return_annotation := annotations.pop("return", None)) is None: - raise exc.MissingReturnAnnotationError(fn) - - func_name = name if name is not None else fn.__name__ - - fields = { - arg_name: Argument( - pattern=rlz.ValueOf(annotations.get(arg_name)), default=param.default - ) - for arg_name, param in inspect.signature(fn).parameters.items() - } | { - "dtype": dt.dtype(return_annotation), - "__input_type__": input_type, - # must wrap `fn` in a `property` otherwise `fn` is assumed to be a - # method - "__func__": property(fget=lambda _, fn=fn: fn), - "__config__": FrozenDict(args=args, kwargs=FrozenDict(**kwargs)), - "__udf_namespace__": schema, - 
"__module__": fn.__module__, - "__func_name__": func_name, - } - - return type(func_name, (ScalarUDF,), fields) - - @staticmethod - def _make_wrapper( - input_type: InputType, fn: Callable, *args: Any, **kwargs: Any - ) -> Callable: - node = scalar._make_node(fn, input_type, *args, **kwargs) - - @functools.wraps(fn) - def construct(*args: Any, **kwargs: Any) -> ir.Value: - return node(*args, **kwargs).to_expr() - - return construct + return super().builtin(*args, **kwargs)