From b5d7350f5ddd7c62a7f2ea414e33edd7373435ad Mon Sep 17 00:00:00 2001 From: Cody Date: Tue, 16 Jan 2024 21:25:51 -0500 Subject: [PATCH 01/13] docs: blog for the 1 billion row challenge --- docs/posts/1brc/.gitignore | 1 + docs/posts/1brc/index.qmd | 73 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 docs/posts/1brc/.gitignore create mode 100644 docs/posts/1brc/index.qmd diff --git a/docs/posts/1brc/.gitignore b/docs/posts/1brc/.gitignore new file mode 100644 index 000000000000..76c99377eda0 --- /dev/null +++ b/docs/posts/1brc/.gitignore @@ -0,0 +1 @@ +1brc \ No newline at end of file diff --git a/docs/posts/1brc/index.qmd b/docs/posts/1brc/index.qmd new file mode 100644 index 000000000000..f6a59df16626 --- /dev/null +++ b/docs/posts/1brc/index.qmd @@ -0,0 +1,73 @@ +--- +title: "1 billion row challenge with Ibis and DuckDB" +author: "" +date: "2024-01-40" +categories: + - blog + - duckdb +--- + +## Overview + +https://github.com/gunnarmorling/1brc + +```{.bash} +gh repo clone gunnarmorling/1brc +``` + +```{.bash} +cd 1brc/src/main/python +python create_measurements.py 1_000_000_000 +``` + +```{python} +import ibis + +ibis.set_backend("polars") + +ibis.options.interactive = True +``` + +```{python} +if ibis.get_backend().name == "duckdb": + t = ibis.read_csv( + "1brc/data/measurements.txt", + delim=";", + header=False, + columns={"station": "VARCHAR", "temperature": "DOUBLE"}, + ) +elif ibis.get_backend().name == "polars": + t = ibis.read_csv( + "1brc/data/measurements.txt", + separator=";", + has_header=False, + new_columns=["station", "temperature"], + ) +t +``` + +```{python} +t +``` + +```{python} +f"{t.count().to_pandas():,}" +``` + +```{python} +import time + +t1 = time.time() +res = ( + t.group_by(ibis._.station) + .agg( + min_temp=ibis._.temperature.min(), + mean_temp=ibis._.temperature.mean(), + max_temp=ibis._.temperature.max(), + ) + .order_by(ibis._.station.desc()) +) +print(res) +t2 = time.time() +t2 - t1 +``` \ No newline at end of file From fa812b2816e02d9fb2d558c80a46dafb36d4eee4 Mon Sep 17 00:00:00 2001 From: Cody Date: Wed, 17 Jan 2024 08:51:54 -0500 Subject: [PATCH 02/13] add schema to polars, didn't help --- docs/posts/1brc/index.qmd | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/posts/1brc/index.qmd b/docs/posts/1brc/index.qmd index f6a59df16626..a9a02cf58b18 100644 --- a/docs/posts/1brc/index.qmd +++ b/docs/posts/1brc/index.qmd @@ -22,9 +22,10 @@ python create_measurements.py 1_000_000_000 ```{python} import ibis +import polars as pl ibis.set_backend("polars") - +#ibis.set_backend("duckdb") ibis.options.interactive = True ``` @@ -42,14 +43,11 @@ elif ibis.get_backend().name == "polars": separator=";", has_header=False, new_columns=["station", "temperature"], + schema={"station": pl.datatypes.Utf8, "temperature": pl.datatypes.Float64}, ) t ``` -```{python} -t -``` - ```{python} f"{t.count().to_pandas():,}" ``` From 39eb6a652356a8241d730006b9062822d859c967 Mon Sep 17 00:00:00 2001 From: Cody Date: Wed, 17 Jan 2024 11:20:44 -0500 Subject: [PATCH 03/13] save for now --- docs/posts/1brc/index.qmd | 78 +++++++++++++++++++++++++++++---------- 1 file changed, 59 insertions(+), 19 deletions(-) diff --git a/docs/posts/1brc/index.qmd b/docs/posts/1brc/index.qmd index a9a02cf58b18..cc987394252f 100644 --- a/docs/posts/1brc/index.qmd +++ b/docs/posts/1brc/index.qmd @@ -9,6 +9,8 @@ categories: ## Overview +This is a redux of [The One Billion Row 
Challenge](https://www.morling.dev/blog/one-billion-row-challenge/), + https://github.com/gunnarmorling/1brc ```{.bash} @@ -20,35 +22,73 @@ cd 1brc/src/main/python python create_measurements.py 1_000_000_000 ``` -```{python} +```{.python} import ibis import polars as pl +import pyarrow as pa -ibis.set_backend("polars") +#ibis.set_backend("polars") #ibis.set_backend("duckdb") +ibis.set_backend("datafusion") ibis.options.interactive = True ``` -```{python} -if ibis.get_backend().name == "duckdb": - t = ibis.read_csv( - "1brc/data/measurements.txt", - delim=";", - header=False, - columns={"station": "VARCHAR", "temperature": "DOUBLE"}, - ) -elif ibis.get_backend().name == "polars": - t = ibis.read_csv( - "1brc/data/measurements.txt", - separator=";", - has_header=False, - new_columns=["station", "temperature"], - schema={"station": pl.datatypes.Utf8, "temperature": pl.datatypes.Float64}, - ) +```{.python} +duckdb_kwargs = { + "delim": ";", + "header": False, + "columns": {"station": "VARCHAR", "temperature": "DOUBLE"}, +} + +polars_kwargs = { + "separator": ";", + "has_header": False, + "new_columns": ["station", "temperature"], + "schema": {"station": pl.Utf8, "temperature": pl.Float64}, +} + +datafusion_kwargs = { + "delimiter": ";", + "has_header": False, + "schema": pa.schema( + [ + ( + "station", + pa.string(), + ), + ( + "temperature", + pa.float32(), + ), + ] + ), + "file_extension": ".txt", +} + +clickhouse_kwargs = { + "format": "CSV", + "types": {"station": "String", "temperature": "Float64"}, +} + +# kwargs = duckdb_kwargs if ibis.get_backend().name == "duckdb" else polars_kwargs +match ibis.get_backend().name: + case "duckdb": + kwargs = duckdb_kwargs + case "polars": + kwargs = polars_kwargs + case "datafusion": + kwargs = datafusion_kwargs + + +kwargs +``` + +```{.python} +t = ibis.read_csv("1brc/data/measurements.txt", **kwargs) t ``` -```{python} +```{.python} f"{t.count().to_pandas():,}" ``` From 468d140881a7eb53c3af71aa9430f96cb0e3b85e Mon Sep 17 00:00:00 2001 From: Cody Date: Wed, 17 Jan 2024 21:13:23 -0500 Subject: [PATCH 04/13] update blog --- .../1brc/index/execute-results/html.json | 19 ++ docs/posts/1brc/index.qmd | 261 +++++++++++++++--- 2 files changed, 247 insertions(+), 33 deletions(-) create mode 100644 docs/_freeze/posts/1brc/index/execute-results/html.json diff --git a/docs/_freeze/posts/1brc/index/execute-results/html.json b/docs/_freeze/posts/1brc/index/execute-results/html.json new file mode 100644 index 000000000000..56ed772bbc8c --- /dev/null +++ b/docs/_freeze/posts/1brc/index/execute-results/html.json @@ -0,0 +1,19 @@ +{ + "hash": "2535533709f9050aabea0ff0ff5b2649", + "result": { + "engine": "jupyter", + "markdown": "---\ntitle: \"Using one Python dataframe API to take the billion row challenge with DuckDB, Polars, and DataFusion\"\nauthor: \"Cody\"\ndate: \"2024-01-40\"\ncategories:\n - blog\n - duckdb\n - polars\n - datafusion\n---\n\n## Overview\n\nThis is an implementation of the [The One Billion Row\nChallenge](https://www.morling.dev/blog/one-billion-row-challenge/):\n\n> Let’s kick off 2024 true coder style—​I’m excited to announce the One Billion\n> Row Challenge (1BRC), running from Jan 1 until Jan 31.\n\n> Your mission, should you decide to accept it, is deceptively simple: write a\n> Java program for retrieving temperature measurement values from a text file and\n> calculating the min, mean, and max temperature per weather station. 
There’s just\n> one caveat: the file has 1,000,000,000 rows!\n\nI haven't written Java since dropping a computer science course my second year\nof college that forced us to do functional programming exclusively in Java.\nHowever, I'll gladly take the challenge in Python using Ibis! In fact, I did\nsomething like this (generating a billion rows with 26 columns of random numbers\nand doing basic aggregations) to test out DuckDB and Polars.\n\nIn this blog, we'll demonstrate how Ibis provides a single Python dataframe API\nto take the billion row challenge with DuckDB, Polars, and DataFusion.\n\n## Setup\n\nWe need to generate the data from the challenge. First, clone the repo:\n\n```{.bash}\ngh repo clone gunnarmorling/1brc\n```\n\nThen change into the Python directory and run the generation script with the\nnumber of rows you want to generate:\n\n```{.bash}\ncd 1brc/src/main/python\npython create_measurements.py 1_000_000_000\n```\n\nThis will generate a file called `measurements.txt` in the `data` directory at\nthe root of the repo. It is 15GB on disk:\n\n```{.bash}\n(venv) cody@voda 1brc % du 1brc/data/*\n 15G 1brc/data/measurements.txt\n808K 1brc/data/weather_stations.csv\n```\n\nAnd consists of one billion rows with two columns separated by a semicolon:\n\n```{.bash}\n(venv) cody@voda 1brc % head 1brc/data/measurements.txt \nKusugal;-67.2\nIpil;-88.6\nSohna;-31.2\nLubuagan;-2.3\nSzentes;29.2\nSylvan Lake;-70.7\nAmbato;-35.2\nBerkine;97.0\nWernau;73.4\nKennewick;-19.9\n```\n\nAlso, you'll need to install Ibis with the three backends we'll use:\n\n```{.bash}\npip install ibis-framework[duckdb,polars,datafusion]\n```\n\n## Understanding Ibis\n\nIbis provides a standard dataframe API decoupled from the execution engine. It\ncompiles Ibis expressions to a form of intermediary representation (often SQL)\nthat can be executed by different backends.\n\nThis allows us to write a single Ibis expression to complete the challenge with\nmany different execution engine backends.\n\n:::{.callout-warning}\nWhile Ibis does its best to abstract away the differences between backends, this\ncannot be done in some areas like data input and output. For example, the\n`read_csv` function across various backends (in their SQL and Python forms) have\ndifferent parameters. 
We'll handle that with different `kwargs` dictionaries for\nthese backends in this post.\n\nIn general, besides creating a connection and data input/output, the Ibis API is\nthe same across backends.\n:::\n\n## Completing the challenge thrice\n\nWe'll use three great options for local backends -- DuckDB, Polars, and\nDataFusion -- to complete the challenge.\n\n### Setup\n\nBefore we get started, we'll make some imports, turn on interactive mode, and\ndefine the `kwargs` dictionariy for the backends corresponding to their\n`read_csv` function:\n\n::: {#4dcdf939 .cell execution_count=1}\n``` {.python .cell-code}\nimport ibis\nimport polars as pl\nimport pyarrow as pa\n\nibis.options.interactive = True\n\nduckdb_kwargs = {\n \"delim\": \";\",\n \"header\": False,\n \"columns\": {\"station\": \"VARCHAR\", \"temperature\": \"DOUBLE\"},\n}\n\npolars_kwargs = {\n \"separator\": \";\",\n \"has_header\": False,\n \"new_columns\": [\"station\", \"temperature\"],\n \"schema\": {\"station\": pl.Utf8, \"temperature\": pl.Float64},\n}\n\ndatafusion_kwargs = {\n \"delimiter\": \";\",\n \"has_header\": False,\n \"schema\": pa.schema(\n [\n (\n \"station\",\n pa.string(),\n ),\n (\n \"temperature\",\n pa.float64(),\n ),\n ]\n ),\n \"file_extension\": \".txt\",\n}\n```\n:::\n\n\n### Completing the challenge\n\nLet's complete the challenge with each backend.\n\n::: {.panel-tabset}\n\n## DuckDb\n\nFirst let's set the backend to DuckDB (redundantly since it's the default) and\nthe `kwargs` dictionary:\n\n::: {#5fd5ce08 .cell execution_count=2}\n``` {.python .cell-code}\nibis.set_backend(\"duckdb\") # <1>\nkwargs = duckdb_kwargs\n```\n:::\n\n\n1. Redundant given DuckDB is the default\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#031d5f6a .cell execution_count=3}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=3}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station ┃ temperature ┃\n┡━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string  │ float64     │\n├─────────┼─────────────┤\n│ Kusugal │       -67.2 │\n│ Ipil    │       -88.6 │\n│ Sohna   │       -31.2 │\n└─────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#6e64e36b .cell execution_count=4}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display}\n```{=html}\n\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=4}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#2877a165 .cell execution_count=5}\n``` {.python .cell-code}\nres = (\n t.group_by(ibis._.station)\n .agg(\n min_temp=ibis._.temperature.min(),\n mean_temp=ibis._.temperature.mean(),\n max_temp=ibis._.temperature.max(),\n )\n .order_by(ibis._.station.desc())\n)\nres\n```\n\n::: {.cell-output .cell-output-display}\n```{=html}\n\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=5}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station            ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string             │ float64  │ float64   │ float64  │\n├────────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Hertogenbosch   │    -99.9 │ -0.021648 │     99.9 │\n│ ’Aïn el Hammam     │    -99.9 │ -0.213882 │     99.9 │\n│ ’Aïn Abid          │    -99.9 │ -0.240846 │     99.9 │\n│ ’Ali Ben Sliman    │    -99.9 │  0.247012 │     99.9 │\n│ ‘Aqrah             │    -99.9 │ -0.022524 │     99.9 │\n│ ‘Ajmān             │    -99.9 │  0.058018 │     99.9 │\n│ ‘Abasān al Kabīrah │    -99.9 │ -0.367529 │     99.9 │\n│ Ấp Tân Ngãi        │    -99.9 │ -0.163204 │     99.9 │\n│ Ấp Khánh Hòa       │    -99.9 │  0.015509 │     99.9 │\n│ Ḩā’il              │    -99.9 │ -0.359516 │     99.9 │\n│ …                  │        … │         … │        … │\n└────────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n## Polars\n\nFirst let's set the backend to Polars and the `kwargs` dictionary:\n\n::: {#64deab44 .cell execution_count=6}\n``` {.python .cell-code}\nibis.set_backend(\"polars\") # <1>\nkwargs = polars_kwargs\n```\n:::\n\n\n1. Set Polars as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#c17b9c4e .cell execution_count=7}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=7}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station ┃ temperature ┃\n┡━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string  │ float64     │\n├─────────┼─────────────┤\n│ Kusugal │       -67.2 │\n│ Ipil    │       -88.6 │\n│ Sohna   │       -31.2 │\n└─────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#ba7e5de7 .cell execution_count=8}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=8}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#33627c55 .cell execution_count=9}\n``` {.python .cell-code}\nres = (\n t.group_by(ibis._.station)\n .agg(\n min_temp=ibis._.temperature.min(),\n mean_temp=ibis._.temperature.mean(),\n max_temp=ibis._.temperature.max(),\n )\n .order_by(ibis._.station.desc())\n)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=9}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station            ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string             │ float64  │ float64   │ float64  │\n├────────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Hertogenbosch   │    -99.9 │ -0.021648 │     99.9 │\n│ ’Aïn el Hammam     │    -99.9 │ -0.213882 │     99.9 │\n│ ’Aïn Abid          │    -99.9 │ -0.240846 │     99.9 │\n│ ’Ali Ben Sliman    │    -99.9 │  0.247012 │     99.9 │\n│ ‘Aqrah             │    -99.9 │ -0.022524 │     99.9 │\n│ ‘Ajmān             │    -99.9 │  0.058018 │     99.9 │\n│ ‘Abasān al Kabīrah │    -99.9 │ -0.367529 │     99.9 │\n│ Ấp Tân Ngãi        │    -99.9 │ -0.163204 │     99.9 │\n│ Ấp Khánh Hòa       │    -99.9 │  0.015509 │     99.9 │\n│ Ḩā’il              │    -99.9 │ -0.359516 │     99.9 │\n│ …                  │        … │         … │        … │\n└────────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n## DataFusion\n\nFirst let's set the backend to DataFusion and the `kwargs` dictionary:\n\n::: {#73754bc0 .cell execution_count=10}\n``` {.python .cell-code}\nibis.set_backend(\"datafusion\") # <1>\nkwargs = datafusion_kwargs\n```\n:::\n\n\n1. Set DataFusion as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#4064bd17 .cell execution_count=11}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=11}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station ┃ temperature ┃\n┡━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string  │ float64     │\n├─────────┼─────────────┤\n│ Kusugal │       -67.2 │\n│ Ipil    │       -88.6 │\n│ Sohna   │       -31.2 │\n└─────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#270eb4ca .cell execution_count=12}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=12}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#931f2ba0 .cell execution_count=13}\n``` {.python .cell-code}\nres = (\n t.group_by(ibis._.station)\n .agg(\n min_temp=ibis._.temperature.min(),\n mean_temp=ibis._.temperature.mean(),\n max_temp=ibis._.temperature.max(),\n )\n .order_by(ibis._.station.desc())\n)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=13}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station            ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string             │ float64  │ float64   │ float64  │\n├────────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Hertogenbosch   │    -99.9 │ -0.021648 │     99.9 │\n│ ’Aïn el Hammam     │    -99.9 │ -0.213882 │     99.9 │\n│ ’Aïn Abid          │    -99.9 │ -0.240846 │     99.9 │\n│ ’Ali Ben Sliman    │    -99.9 │  0.247012 │     99.9 │\n│ ‘Aqrah             │    -99.9 │ -0.022524 │     99.9 │\n│ ‘Ajmān             │    -99.9 │  0.058018 │     99.9 │\n│ ‘Abasān al Kabīrah │    -99.9 │ -0.367529 │     99.9 │\n│ Ấp Tân Ngãi        │    -99.9 │ -0.163204 │     99.9 │\n│ Ấp Khánh Hòa       │    -99.9 │  0.015509 │     99.9 │\n│ Ḩā’il              │    -99.9 │ -0.359516 │     99.9 │\n│ …                  │        … │         … │        … │\n└────────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n:::\n\n## Bonus: more billion row data generation\n\nWhile we're here, I'll share the code I've used in the past to generate a\nbillion rows of random data:\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nsql_str = \"\"\nsql_str += \"select\\n\"\nfor c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1))):\n sql_str += f\" random() as {c},\\n\"\nsql_str += f\"from generate_series(1, {ROWS})\"\n\nt = con.sql(sql_str)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nHappy coding!\n\n## Conclusion\n\nWhile the one billion row challenge isn't a great benchmark, it's a fun way to\ndemonstrate how Ibis provides a single Python dataframe API to take the billion\nrow challenge with DuckDB, Polars, and DataFusion. Feel free to try it out with\nother backends!\n\n", + "supporting": [ + "index_files/figure-html" + ], + "filters": [], + "includes": { + "include-in-header": [ + "\n\n\n\n" + ], + "include-after-body": [ + "\n" + ] + } + } +} \ No newline at end of file diff --git a/docs/posts/1brc/index.qmd b/docs/posts/1brc/index.qmd index cc987394252f..26bcb007560c 100644 --- a/docs/posts/1brc/index.qmd +++ b/docs/posts/1brc/index.qmd @@ -1,39 +1,121 @@ --- -title: "1 billion row challenge with Ibis and DuckDB" -author: "" +title: "Using one Python dataframe API to take the billion row challenge with DuckDB, Polars, and DataFusion" +author: "Cody" date: "2024-01-40" categories: - blog - duckdb + - polars + - datafusion --- ## Overview -This is a redux of [The One Billion Row Challenge](https://www.morling.dev/blog/one-billion-row-challenge/), +This is an implementation of the [The One Billion Row +Challenge](https://www.morling.dev/blog/one-billion-row-challenge/): -https://github.com/gunnarmorling/1brc +> Let’s kick off 2024 true coder style—​I’m excited to announce the One Billion +> Row Challenge (1BRC), running from Jan 1 until Jan 31. + +> Your mission, should you decide to accept it, is deceptively simple: write a +> Java program for retrieving temperature measurement values from a text file and +> calculating the min, mean, and max temperature per weather station. There’s just +> one caveat: the file has 1,000,000,000 rows! + +I haven't written Java since dropping a computer science course my second year +of college that forced us to do functional programming exclusively in Java. +However, I'll gladly take the challenge in Python using Ibis! In fact, I did +something like this (generating a billion rows with 26 columns of random numbers +and doing basic aggregations) to test out DuckDB and Polars. + +In this blog, we'll demonstrate how Ibis provides a single Python dataframe API +to take the billion row challenge with DuckDB, Polars, and DataFusion. + +## Setup + +We need to generate the data from the challenge. First, clone the repo: ```{.bash} gh repo clone gunnarmorling/1brc ``` +Then change into the Python directory and run the generation script with the +number of rows you want to generate: + ```{.bash} cd 1brc/src/main/python python create_measurements.py 1_000_000_000 ``` -```{.python} +This will generate a file called `measurements.txt` in the `data` directory at +the root of the repo. 
It is 15GB on disk: + +```{.bash} +(venv) cody@voda 1brc % du 1brc/data/* + 15G 1brc/data/measurements.txt +808K 1brc/data/weather_stations.csv +``` + +And consists of one billion rows with two columns separated by a semicolon: + +```{.bash} +(venv) cody@voda 1brc % head 1brc/data/measurements.txt +Kusugal;-67.2 +Ipil;-88.6 +Sohna;-31.2 +Lubuagan;-2.3 +Szentes;29.2 +Sylvan Lake;-70.7 +Ambato;-35.2 +Berkine;97.0 +Wernau;73.4 +Kennewick;-19.9 +``` + +Also, you'll need to install Ibis with the three backends we'll use: + +```{.bash} +pip install ibis-framework[duckdb,polars,datafusion] +``` + +## Understanding Ibis + +Ibis provides a standard dataframe API decoupled from the execution engine. It +compiles Ibis expressions to a form of intermediary representation (often SQL) +that can be executed by different backends. + +This allows us to write a single Ibis expression to complete the challenge with +many different execution engine backends. + +:::{.callout-warning} +While Ibis does its best to abstract away the differences between backends, this +cannot be done in some areas like data input and output. For example, the +`read_csv` function across various backends (in their SQL and Python forms) have +different parameters. We'll handle that with different `kwargs` dictionaries for +these backends in this post. + +In general, besides creating a connection and data input/output, the Ibis API is +the same across backends. +::: + +## Completing the challenge thrice + +We'll use three great options for local backends -- DuckDB, Polars, and +DataFusion -- to complete the challenge. + +### Setup + +Before we get started, we'll make some imports, turn on interactive mode, and +define the `kwargs` dictionariy for the backends corresponding to their +`read_csv` function: + +```{python} import ibis import polars as pl import pyarrow as pa -#ibis.set_backend("polars") -#ibis.set_backend("duckdb") -ibis.set_backend("datafusion") ibis.options.interactive = True -``` -```{.python} duckdb_kwargs = { "delim": ";", "header": False, @@ -58,44 +140,126 @@ datafusion_kwargs = { ), ( "temperature", - pa.float32(), + pa.float64(), ), ] ), "file_extension": ".txt", } +``` -clickhouse_kwargs = { - "format": "CSV", - "types": {"station": "String", "temperature": "Float64"}, -} +### Completing the challenge -# kwargs = duckdb_kwargs if ibis.get_backend().name == "duckdb" else polars_kwargs -match ibis.get_backend().name: - case "duckdb": - kwargs = duckdb_kwargs - case "polars": - kwargs = polars_kwargs - case "datafusion": - kwargs = datafusion_kwargs +Let's complete the challenge with each backend. +::: {.panel-tabset} -kwargs +## DuckDb + +First let's set the backend to DuckDB (redundantly since it's the default) and +the `kwargs` dictionary: + +```{python} +ibis.set_backend("duckdb") # <1> +kwargs = duckdb_kwargs ``` -```{.python} +1. 
Redundant given DuckDB is the default + +Next, we'll read in the data and take a look at the table: + +```{python} t = ibis.read_csv("1brc/data/measurements.txt", **kwargs) -t +t.limit(3) ``` -```{.python} +Then let's confirm it's **a billion** rows: + +```{python} +f"{t.count().to_pandas():,}" +``` + +Finally, we'll compute the min, mean, and max temperature per weather station: + +```{python} +res = ( + t.group_by(ibis._.station) + .agg( + min_temp=ibis._.temperature.min(), + mean_temp=ibis._.temperature.mean(), + max_temp=ibis._.temperature.max(), + ) + .order_by(ibis._.station.desc()) +) +res +``` + +## Polars + +First let's set the backend to Polars and the `kwargs` dictionary: + +```{python} +ibis.set_backend("polars") # <1> +kwargs = polars_kwargs +``` + +1. Set Polars as the default backend used + +Next, we'll read in the data and take a look at the table: + +```{python} +t = ibis.read_csv("1brc/data/measurements.txt", **kwargs) +t.limit(3) +``` + +Then let's confirm it's **a billion** rows: + +```{python} f"{t.count().to_pandas():,}" ``` +Finally, we'll compute the min, mean, and max temperature per weather station: + +```{python} +res = ( + t.group_by(ibis._.station) + .agg( + min_temp=ibis._.temperature.min(), + mean_temp=ibis._.temperature.mean(), + max_temp=ibis._.temperature.max(), + ) + .order_by(ibis._.station.desc()) +) +res +``` + +## DataFusion + +First let's set the backend to DataFusion and the `kwargs` dictionary: + +```{python} +ibis.set_backend("datafusion") # <1> +kwargs = datafusion_kwargs +``` + +1. Set DataFusion as the default backend used + +Next, we'll read in the data and take a look at the table: + ```{python} -import time +t = ibis.read_csv("1brc/data/measurements.txt", **kwargs) +t.limit(3) +``` + +Then let's confirm it's **a billion** rows: -t1 = time.time() +```{python} +f"{t.count().to_pandas():,}" +``` + +Finally, we'll compute the min, mean, and max temperature per weather station: + +```{python} res = ( t.group_by(ibis._.station) .agg( @@ -105,7 +269,38 @@ res = ( ) .order_by(ibis._.station.desc()) ) -print(res) -t2 = time.time() -t2 - t1 -``` \ No newline at end of file +res +``` + +::: + +## Bonus: more billion row data generation + +While we're here, I'll share the code I've used in the past to generate a +billion rows of random data: + +```{.python} +import ibis + +con = ibis.connect("duckdb://data.ddb") + +ROWS = 1_000_000_000 + +sql_str = "" +sql_str += "select\n" +for c in list(map(chr, range(ord("a"), ord("z") + 1))): + sql_str += f" random() as {c},\n" +sql_str += f"from generate_series(1, {ROWS})" + +t = con.sql(sql_str) +con.create_table("billion", t, overwrite=True) +``` + +Happy coding! + +## Conclusion + +While the one billion row challenge isn't a great benchmark, it's a fun way to +demonstrate how Ibis provides a single Python dataframe API to take the billion +row challenge with DuckDB, Polars, and DataFusion. Feel free to try it out with +other backends! 
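
As an aside, you can check the "one API, many engines" claim without executing
anything: Ibis can show you the SQL it generates for a given backend. Here's a
minimal sketch (not from the challenge repo or the post above) using an unbound
table; the table name and the DuckDB dialect choice are illustrative
assumptions:

```{.python}
import ibis

# Unbound table standing in for measurements.txt; nothing is read or executed.
t = ibis.table(
    {"station": "string", "temperature": "float64"}, name="measurements"
)

# The same aggregation used throughout this post.
expr = (
    t.group_by(ibis._.station)
    .agg(
        min_temp=ibis._.temperature.min(),
        mean_temp=ibis._.temperature.mean(),
        max_temp=ibis._.temperature.max(),
    )
    .order_by(ibis._.station.desc())
)

# Compile the expression to a specific backend's SQL dialect.
print(ibis.to_sql(expr, dialect="duckdb"))
```

Because the table is unbound, this compiles without touching the 15GB file,
which is handy for sanity-checking a query before running it.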
From b1feec734132167de7c7a47561f6afe2f8736a3e Mon Sep 17 00:00:00 2001 From: Cody Date: Wed, 17 Jan 2024 21:18:22 -0500 Subject: [PATCH 05/13] fix whitespace --- docs/posts/1brc/.gitignore | 2 +- docs/posts/1brc/index.qmd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/posts/1brc/.gitignore b/docs/posts/1brc/.gitignore index 76c99377eda0..581b9361d26a 100644 --- a/docs/posts/1brc/.gitignore +++ b/docs/posts/1brc/.gitignore @@ -1 +1 @@ -1brc \ No newline at end of file +1brc diff --git a/docs/posts/1brc/index.qmd b/docs/posts/1brc/index.qmd index 26bcb007560c..a5ac4ed882ec 100644 --- a/docs/posts/1brc/index.qmd +++ b/docs/posts/1brc/index.qmd @@ -59,7 +59,7 @@ the root of the repo. It is 15GB on disk: And consists of one billion rows with two columns separated by a semicolon: ```{.bash} -(venv) cody@voda 1brc % head 1brc/data/measurements.txt +(venv) cody@voda 1brc % head 1brc/data/measurements.txt Kusugal;-67.2 Ipil;-88.6 Sohna;-31.2 From a5eedffa028fa79d9b4a49aa85e96fc72d851796 Mon Sep 17 00:00:00 2001 From: Cody Date: Thu, 18 Jan 2024 11:13:52 -0500 Subject: [PATCH 06/13] blog updates --- .../1brc/index/execute-results/html.json | 8 +- docs/posts/1brc/index.qmd | 90 ++++++++++++------- 2 files changed, 64 insertions(+), 34 deletions(-) diff --git a/docs/_freeze/posts/1brc/index/execute-results/html.json b/docs/_freeze/posts/1brc/index/execute-results/html.json index 56ed772bbc8c..c9dd7fb37cc7 100644 --- a/docs/_freeze/posts/1brc/index/execute-results/html.json +++ b/docs/_freeze/posts/1brc/index/execute-results/html.json @@ -1,10 +1,10 @@ { - "hash": "2535533709f9050aabea0ff0ff5b2649", + "hash": "41a0a3202206a8095fbf2043d1954053", "result": { "engine": "jupyter", - "markdown": "---\ntitle: \"Using one Python dataframe API to take the billion row challenge with DuckDB, Polars, and DataFusion\"\nauthor: \"Cody\"\ndate: \"2024-01-40\"\ncategories:\n - blog\n - duckdb\n - polars\n - datafusion\n---\n\n## Overview\n\nThis is an implementation of the [The One Billion Row\nChallenge](https://www.morling.dev/blog/one-billion-row-challenge/):\n\n> Let’s kick off 2024 true coder style—​I’m excited to announce the One Billion\n> Row Challenge (1BRC), running from Jan 1 until Jan 31.\n\n> Your mission, should you decide to accept it, is deceptively simple: write a\n> Java program for retrieving temperature measurement values from a text file and\n> calculating the min, mean, and max temperature per weather station. There’s just\n> one caveat: the file has 1,000,000,000 rows!\n\nI haven't written Java since dropping a computer science course my second year\nof college that forced us to do functional programming exclusively in Java.\nHowever, I'll gladly take the challenge in Python using Ibis! In fact, I did\nsomething like this (generating a billion rows with 26 columns of random numbers\nand doing basic aggregations) to test out DuckDB and Polars.\n\nIn this blog, we'll demonstrate how Ibis provides a single Python dataframe API\nto take the billion row challenge with DuckDB, Polars, and DataFusion.\n\n## Setup\n\nWe need to generate the data from the challenge. First, clone the repo:\n\n```{.bash}\ngh repo clone gunnarmorling/1brc\n```\n\nThen change into the Python directory and run the generation script with the\nnumber of rows you want to generate:\n\n```{.bash}\ncd 1brc/src/main/python\npython create_measurements.py 1_000_000_000\n```\n\nThis will generate a file called `measurements.txt` in the `data` directory at\nthe root of the repo. 
It is 15GB on disk:\n\n```{.bash}\n(venv) cody@voda 1brc % du 1brc/data/*\n 15G 1brc/data/measurements.txt\n808K 1brc/data/weather_stations.csv\n```\n\nAnd consists of one billion rows with two columns separated by a semicolon:\n\n```{.bash}\n(venv) cody@voda 1brc % head 1brc/data/measurements.txt \nKusugal;-67.2\nIpil;-88.6\nSohna;-31.2\nLubuagan;-2.3\nSzentes;29.2\nSylvan Lake;-70.7\nAmbato;-35.2\nBerkine;97.0\nWernau;73.4\nKennewick;-19.9\n```\n\nAlso, you'll need to install Ibis with the three backends we'll use:\n\n```{.bash}\npip install ibis-framework[duckdb,polars,datafusion]\n```\n\n## Understanding Ibis\n\nIbis provides a standard dataframe API decoupled from the execution engine. It\ncompiles Ibis expressions to a form of intermediary representation (often SQL)\nthat can be executed by different backends.\n\nThis allows us to write a single Ibis expression to complete the challenge with\nmany different execution engine backends.\n\n:::{.callout-warning}\nWhile Ibis does its best to abstract away the differences between backends, this\ncannot be done in some areas like data input and output. For example, the\n`read_csv` function across various backends (in their SQL and Python forms) have\ndifferent parameters. We'll handle that with different `kwargs` dictionaries for\nthese backends in this post.\n\nIn general, besides creating a connection and data input/output, the Ibis API is\nthe same across backends.\n:::\n\n## Completing the challenge thrice\n\nWe'll use three great options for local backends -- DuckDB, Polars, and\nDataFusion -- to complete the challenge.\n\n### Setup\n\nBefore we get started, we'll make some imports, turn on interactive mode, and\ndefine the `kwargs` dictionariy for the backends corresponding to their\n`read_csv` function:\n\n::: {#4dcdf939 .cell execution_count=1}\n``` {.python .cell-code}\nimport ibis\nimport polars as pl\nimport pyarrow as pa\n\nibis.options.interactive = True\n\nduckdb_kwargs = {\n \"delim\": \";\",\n \"header\": False,\n \"columns\": {\"station\": \"VARCHAR\", \"temperature\": \"DOUBLE\"},\n}\n\npolars_kwargs = {\n \"separator\": \";\",\n \"has_header\": False,\n \"new_columns\": [\"station\", \"temperature\"],\n \"schema\": {\"station\": pl.Utf8, \"temperature\": pl.Float64},\n}\n\ndatafusion_kwargs = {\n \"delimiter\": \";\",\n \"has_header\": False,\n \"schema\": pa.schema(\n [\n (\n \"station\",\n pa.string(),\n ),\n (\n \"temperature\",\n pa.float64(),\n ),\n ]\n ),\n \"file_extension\": \".txt\",\n}\n```\n:::\n\n\n### Completing the challenge\n\nLet's complete the challenge with each backend.\n\n::: {.panel-tabset}\n\n## DuckDb\n\nFirst let's set the backend to DuckDB (redundantly since it's the default) and\nthe `kwargs` dictionary:\n\n::: {#5fd5ce08 .cell execution_count=2}\n``` {.python .cell-code}\nibis.set_backend(\"duckdb\") # <1>\nkwargs = duckdb_kwargs\n```\n:::\n\n\n1. Redundant given DuckDB is the default\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#031d5f6a .cell execution_count=3}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=3}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station ┃ temperature ┃\n┡━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string  │ float64     │\n├─────────┼─────────────┤\n│ Kusugal │       -67.2 │\n│ Ipil    │       -88.6 │\n│ Sohna   │       -31.2 │\n└─────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#6e64e36b .cell execution_count=4}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display}\n```{=html}\n\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=4}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#2877a165 .cell execution_count=5}\n``` {.python .cell-code}\nres = (\n t.group_by(ibis._.station)\n .agg(\n min_temp=ibis._.temperature.min(),\n mean_temp=ibis._.temperature.mean(),\n max_temp=ibis._.temperature.max(),\n )\n .order_by(ibis._.station.desc())\n)\nres\n```\n\n::: {.cell-output .cell-output-display}\n```{=html}\n\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=5}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station            ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string             │ float64  │ float64   │ float64  │\n├────────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Hertogenbosch   │    -99.9 │ -0.021648 │     99.9 │\n│ ’Aïn el Hammam     │    -99.9 │ -0.213882 │     99.9 │\n│ ’Aïn Abid          │    -99.9 │ -0.240846 │     99.9 │\n│ ’Ali Ben Sliman    │    -99.9 │  0.247012 │     99.9 │\n│ ‘Aqrah             │    -99.9 │ -0.022524 │     99.9 │\n│ ‘Ajmān             │    -99.9 │  0.058018 │     99.9 │\n│ ‘Abasān al Kabīrah │    -99.9 │ -0.367529 │     99.9 │\n│ Ấp Tân Ngãi        │    -99.9 │ -0.163204 │     99.9 │\n│ Ấp Khánh Hòa       │    -99.9 │  0.015509 │     99.9 │\n│ Ḩā’il              │    -99.9 │ -0.359516 │     99.9 │\n│ …                  │        … │         … │        … │\n└────────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n## Polars\n\nFirst let's set the backend to Polars and the `kwargs` dictionary:\n\n::: {#64deab44 .cell execution_count=6}\n``` {.python .cell-code}\nibis.set_backend(\"polars\") # <1>\nkwargs = polars_kwargs\n```\n:::\n\n\n1. Set Polars as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#c17b9c4e .cell execution_count=7}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=7}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station ┃ temperature ┃\n┡━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string  │ float64     │\n├─────────┼─────────────┤\n│ Kusugal │       -67.2 │\n│ Ipil    │       -88.6 │\n│ Sohna   │       -31.2 │\n└─────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#ba7e5de7 .cell execution_count=8}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=8}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#33627c55 .cell execution_count=9}\n``` {.python .cell-code}\nres = (\n t.group_by(ibis._.station)\n .agg(\n min_temp=ibis._.temperature.min(),\n mean_temp=ibis._.temperature.mean(),\n max_temp=ibis._.temperature.max(),\n )\n .order_by(ibis._.station.desc())\n)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=9}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station            ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string             │ float64  │ float64   │ float64  │\n├────────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Hertogenbosch   │    -99.9 │ -0.021648 │     99.9 │\n│ ’Aïn el Hammam     │    -99.9 │ -0.213882 │     99.9 │\n│ ’Aïn Abid          │    -99.9 │ -0.240846 │     99.9 │\n│ ’Ali Ben Sliman    │    -99.9 │  0.247012 │     99.9 │\n│ ‘Aqrah             │    -99.9 │ -0.022524 │     99.9 │\n│ ‘Ajmān             │    -99.9 │  0.058018 │     99.9 │\n│ ‘Abasān al Kabīrah │    -99.9 │ -0.367529 │     99.9 │\n│ Ấp Tân Ngãi        │    -99.9 │ -0.163204 │     99.9 │\n│ Ấp Khánh Hòa       │    -99.9 │  0.015509 │     99.9 │\n│ Ḩā’il              │    -99.9 │ -0.359516 │     99.9 │\n│ …                  │        … │         … │        … │\n└────────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n## DataFusion\n\nFirst let's set the backend to DataFusion and the `kwargs` dictionary:\n\n::: {#73754bc0 .cell execution_count=10}\n``` {.python .cell-code}\nibis.set_backend(\"datafusion\") # <1>\nkwargs = datafusion_kwargs\n```\n:::\n\n\n1. Set DataFusion as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#4064bd17 .cell execution_count=11}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=11}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station ┃ temperature ┃\n┡━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string  │ float64     │\n├─────────┼─────────────┤\n│ Kusugal │       -67.2 │\n│ Ipil    │       -88.6 │\n│ Sohna   │       -31.2 │\n└─────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#270eb4ca .cell execution_count=12}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=12}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#931f2ba0 .cell execution_count=13}\n``` {.python .cell-code}\nres = (\n t.group_by(ibis._.station)\n .agg(\n min_temp=ibis._.temperature.min(),\n mean_temp=ibis._.temperature.mean(),\n max_temp=ibis._.temperature.max(),\n )\n .order_by(ibis._.station.desc())\n)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=13}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station            ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string             │ float64  │ float64   │ float64  │\n├────────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Hertogenbosch   │    -99.9 │ -0.021648 │     99.9 │\n│ ’Aïn el Hammam     │    -99.9 │ -0.213882 │     99.9 │\n│ ’Aïn Abid          │    -99.9 │ -0.240846 │     99.9 │\n│ ’Ali Ben Sliman    │    -99.9 │  0.247012 │     99.9 │\n│ ‘Aqrah             │    -99.9 │ -0.022524 │     99.9 │\n│ ‘Ajmān             │    -99.9 │  0.058018 │     99.9 │\n│ ‘Abasān al Kabīrah │    -99.9 │ -0.367529 │     99.9 │\n│ Ấp Tân Ngãi        │    -99.9 │ -0.163204 │     99.9 │\n│ Ấp Khánh Hòa       │    -99.9 │  0.015509 │     99.9 │\n│ Ḩā’il              │    -99.9 │ -0.359516 │     99.9 │\n│ …                  │        … │         … │        … │\n└────────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n:::\n\n## Bonus: more billion row data generation\n\nWhile we're here, I'll share the code I've used in the past to generate a\nbillion rows of random data:\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nsql_str = \"\"\nsql_str += \"select\\n\"\nfor c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1))):\n sql_str += f\" random() as {c},\\n\"\nsql_str += f\"from generate_series(1, {ROWS})\"\n\nt = con.sql(sql_str)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nHappy coding!\n\n## Conclusion\n\nWhile the one billion row challenge isn't a great benchmark, it's a fun way to\ndemonstrate how Ibis provides a single Python dataframe API to take the billion\nrow challenge with DuckDB, Polars, and DataFusion. Feel free to try it out with\nother backends!\n\n", + "markdown": "---\ntitle: \"Using one Python dataframe API to take the billion row challenge with DuckDB, Polars, and DataFusion\"\nauthor: \"Cody\"\ndate: \"2024-01-22\"\ncategories:\n - blog\n - duckdb\n - polars\n - datafusion\n---\n\n## Overview\n\nThis is an implementation of the [The One Billion Row\nChallenge](https://www.morling.dev/blog/one-billion-row-challenge/):\n\n> Let’s kick off 2024 true coder style—​I’m excited to announce the One Billion\n> Row Challenge (1BRC), running from Jan 1 until Jan 31.\n\n> Your mission, should you decide to accept it, is deceptively simple: write a\n> Java program for retrieving temperature measurement values from a text file and\n> calculating the min, mean, and max temperature per weather station. There’s just\n> one caveat: the file has 1,000,000,000 rows!\n\nI haven't written Java since dropping a computer science course my second year\nof college that forced us to do functional programming exclusively in Java.\nHowever, I'll gladly take the challenge in Python using Ibis! In fact, I did\nsomething like this (generating a billion rows with 26 columns of random numbers\nand doing basic aggregations) to test out DuckDB and Polars.\n\nIn this blog, we'll demonstrate how Ibis provides a single Python dataframe API\nto take the billion row challenge with DuckDB, Polars, and DataFusion.\n\n## Setup\n\nWe need to generate the data from the challenge. First, clone the repo:\n\n```{.bash}\ngh repo clone gunnarmorling/1brc\n```\n\nThen change into the Python directory and run the generation script with the\nnumber of rows you want to generate:\n\n```{.bash}\ncd 1brc/src/main/python\npython create_measurements.py 1_000_000_000\n```\n\nThis will generate a file called `measurements.txt` in the `data` directory at\nthe root of the repo. It is 15GB on disk:\n\n```{.bash}\n(venv) cody@voda 1brc % du 1brc/data/*\n 15G 1brc/data/measurements.txt\n808K 1brc/data/weather_stations.csv\n```\n\nAnd consists of one billion rows with two columns separated by a semicolon:\n\n```{.bash}\n(venv) cody@voda 1brc % head 1brc/data/measurements.txt\nKusugal;-67.2\nIpil;-88.6\nSohna;-31.2\nLubuagan;-2.3\nSzentes;29.2\nSylvan Lake;-70.7\nAmbato;-35.2\nBerkine;97.0\nWernau;73.4\nKennewick;-19.9\n```\n\nAlso, you'll need to install Ibis with the three backends we'll use:\n\n```{.bash}\npip install ibis-framework[duckdb,polars,datafusion]\n```\n\n## Understanding Ibis\n\nIbis provides a standard dataframe API decoupled from the execution engine. 
It\ncompiles Ibis expressions to a form of intermediary representation (often SQL)\nthat can be executed by different backends.\n\nThis allows us to write a single Ibis expression to complete the challenge with\nmany different execution engine backends.\n\n:::{.callout-warning}\nWhile Ibis does its best to abstract away the differences between backends, this\ncannot be done in some areas like data input and output. For example, the\n`read_csv` function across various backends (in their SQL and Python forms) have\ndifferent parameters. We'll handle that with different `kwargs` dictionaries for\nthese backends in this post.\n\nIn general, besides creating a connection and data input/output, the Ibis API is\nthe same across backends.\n:::\n\n## Completing the challenge thrice\n\nWe'll use three great options for local backends -- DuckDB, Polars, and\nDataFusion -- to complete the challenge.\n\n### Setup\n\nBefore we get started, we'll make some imports, turn on interactive mode, and\ndefine the `kwargs` dictionary for the backends corresponding to their\n`read_csv` function:\n\n::: {#3054f50d .cell execution_count=1}\n``` {.python .cell-code}\nimport ibis\nimport polars as pl\nimport pyarrow as pa\n\nibis.options.interactive = True\n\nduckdb_kwargs = {\n \"delim\": \";\",\n \"header\": False,\n \"columns\": {\"station\": \"VARCHAR\", \"temperature\": \"DOUBLE\"},\n}\n\npolars_kwargs = {\n \"separator\": \";\",\n \"has_header\": False,\n \"new_columns\": [\"station\", \"temperature\"],\n \"schema\": {\"station\": pl.Utf8, \"temperature\": pl.Float64},\n}\n\ndatafusion_kwargs = {\n \"delimiter\": \";\",\n \"has_header\": False,\n \"schema\": pa.schema(\n [\n (\n \"station\",\n pa.string(),\n ),\n (\n \"temperature\",\n pa.float64(),\n ),\n ]\n ),\n \"file_extension\": \".txt\",\n}\n```\n:::\n\n\nLet's define a function to run the same code with each backend to complete the challenge:\n\n::: {#482ad8a7 .cell execution_count=2}\n``` {.python .cell-code}\ndef run_challenge(t):\n res = (\n t.group_by(ibis._.station)\n .agg(\n min_temp=ibis._.temperature.min(),\n mean_temp=ibis._.temperature.mean(),\n max_temp=ibis._.temperature.max(),\n )\n .order_by(ibis._.station.desc())\n )\n return res\n```\n:::\n\n\n### Completing the challenge\n\nLet's complete the challenge with each backend.\n\n:::{.callout-note}\nThe results are the same across backends but look suspicious. It is noted in the\nrepository that the Python generation code is \"unofficial\", so may have some\nproblems. Given this is a contrived example of generated data, I'm not going to\nworry about it.\n\nThe point is that we can easily complete the challenge with the same code across\nmany backends, letting them worry about the details of execution. For this\nreason, I'm also not providing execution times. Try it out yourself!\n:::\n\n::: {.panel-tabset}\n\n## DuckDb\n\nFirst let's set the backend to DuckDB (redundantly since it's the default) and\nthe `kwargs` dictionary:\n\n::: {#5b807cac .cell execution_count=3}\n``` {.python .cell-code}\nibis.set_backend(\"duckdb\") # <1>\nkwargs = duckdb_kwargs\n```\n:::\n\n\n1. Redundant given DuckDB is the default\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#8580246b .cell execution_count=4}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=18}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station ┃ temperature ┃\n┡━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string  │ float64     │\n├─────────┼─────────────┤\n│ Kusugal │       -67.2 │\n│ Ipil    │       -88.6 │\n│ Sohna   │       -31.2 │\n└─────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#77b744f3 .cell execution_count=5}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display}\n```{=html}\n\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=19}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#fa56fefc .cell execution_count=6}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display}\n```{=html}\n\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=20}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station            ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string             │ float64  │ float64   │ float64  │\n├────────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Hertogenbosch   │    -99.9 │ -0.021648 │     99.9 │\n│ ’Aïn el Hammam     │    -99.9 │ -0.213882 │     99.9 │\n│ ’Aïn Abid          │    -99.9 │ -0.240846 │     99.9 │\n│ ’Ali Ben Sliman    │    -99.9 │  0.247012 │     99.9 │\n│ ‘Aqrah             │    -99.9 │ -0.022524 │     99.9 │\n│ ‘Ajmān             │    -99.9 │  0.058018 │     99.9 │\n│ ‘Abasān al Kabīrah │    -99.9 │ -0.367529 │     99.9 │\n│ Ấp Tân Ngãi        │    -99.9 │ -0.163204 │     99.9 │\n│ Ấp Khánh Hòa       │    -99.9 │  0.015509 │     99.9 │\n│ Ḩā’il              │    -99.9 │ -0.359516 │     99.9 │\n│ …                  │        … │         … │        … │\n└────────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n## Polars\n\nFirst let's set the backend to Polars and the `kwargs` dictionary:\n\n::: {#f704323d .cell execution_count=7}\n``` {.python .cell-code}\nibis.set_backend(\"polars\") # <1>\nkwargs = polars_kwargs\n```\n:::\n\n\n1. Set Polars as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#496119ab .cell execution_count=8}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=22}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station ┃ temperature ┃\n┡━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string  │ float64     │\n├─────────┼─────────────┤\n│ Kusugal │       -67.2 │\n│ Ipil    │       -88.6 │\n│ Sohna   │       -31.2 │\n└─────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#08a6d932 .cell execution_count=9}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=23}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#dba6844d .cell execution_count=10}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=24}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station            ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string             │ float64  │ float64   │ float64  │\n├────────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Hertogenbosch   │    -99.9 │ -0.021648 │     99.9 │\n│ ’Aïn el Hammam     │    -99.9 │ -0.213882 │     99.9 │\n│ ’Aïn Abid          │    -99.9 │ -0.240846 │     99.9 │\n│ ’Ali Ben Sliman    │    -99.9 │  0.247012 │     99.9 │\n│ ‘Aqrah             │    -99.9 │ -0.022524 │     99.9 │\n│ ‘Ajmān             │    -99.9 │  0.058018 │     99.9 │\n│ ‘Abasān al Kabīrah │    -99.9 │ -0.367529 │     99.9 │\n│ Ấp Tân Ngãi        │    -99.9 │ -0.163204 │     99.9 │\n│ Ấp Khánh Hòa       │    -99.9 │  0.015509 │     99.9 │\n│ Ḩā’il              │    -99.9 │ -0.359516 │     99.9 │\n│ …                  │        … │         … │        … │\n└────────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n## DataFusion\n\nFirst let's set the backend to DataFusion and the `kwargs` dictionary:\n\n::: {#d8a4956e .cell execution_count=11}\n``` {.python .cell-code}\nibis.set_backend(\"datafusion\") # <1>\nkwargs = datafusion_kwargs\n```\n:::\n\n\n1. Set DataFusion as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#7b5e2d13 .cell execution_count=12}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=26}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station ┃ temperature ┃\n┡━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string  │ float64     │\n├─────────┼─────────────┤\n│ Kusugal │       -67.2 │\n│ Ipil    │       -88.6 │\n│ Sohna   │       -31.2 │\n└─────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#d93e68a6 .cell execution_count=13}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=27}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#985bcbf3 .cell execution_count=14}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=28}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station            ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string             │ float64  │ float64   │ float64  │\n├────────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Hertogenbosch   │    -99.9 │ -0.021648 │     99.9 │\n│ ’Aïn el Hammam     │    -99.9 │ -0.213882 │     99.9 │\n│ ’Aïn Abid          │    -99.9 │ -0.240846 │     99.9 │\n│ ’Ali Ben Sliman    │    -99.9 │  0.247012 │     99.9 │\n│ ‘Aqrah             │    -99.9 │ -0.022524 │     99.9 │\n│ ‘Ajmān             │    -99.9 │  0.058018 │     99.9 │\n│ ‘Abasān al Kabīrah │    -99.9 │ -0.367529 │     99.9 │\n│ Ấp Tân Ngãi        │    -99.9 │ -0.163204 │     99.9 │\n│ Ấp Khánh Hòa       │    -99.9 │  0.015509 │     99.9 │\n│ Ḩā’il              │    -99.9 │ -0.359516 │     99.9 │\n│ …                  │        … │         … │        … │\n└────────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n:::\n\n## Bonus: more billion row data generation\n\nWhile we're here, I'll share the code I've used in the past to generate a\nbillion rows of random data:\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nsql_str = \"\"\nsql_str += \"select\\n\"\nfor c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1))):\n sql_str += f\" random() as {c},\\n\"\nsql_str += f\"from generate_series(1, {ROWS})\"\n\nt = con.sql(sql_str)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nNowadays I'd convert that to an Ibis expression:\n\n:::{.callout-note}\nThis is a slightly different result with a monotonic index column, but I prefer\nit anyway. You could drop that column or adjust the expression.\n:::\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nt = (\n ibis.range(ROWS)\n .unnest()\n .name(\"index\")\n .as_table()\n .mutate(**{c: ibis.random() for c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1)))})\n)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nBut if you do need to construct a programmatic SQL string, it's cool that you\ncan!\n\n## Conclusion\n\nWhile the one billion row challenge isn't a great benchmark, it's a fun way to\ndemonstrate how Ibis provides a single Python dataframe API to take the billion\nrow challenge with DuckDB, Polars, and DataFusion. Feel free to try it out with\nother backends!\n\nHappy coding!\n\n", "supporting": [ - "index_files/figure-html" + "index_files" ], "filters": [], "includes": { @@ -12,7 +12,7 @@ "\n\n\n\n" ], "include-after-body": [ - "\n" + "\n" ] } } diff --git a/docs/posts/1brc/index.qmd b/docs/posts/1brc/index.qmd index a5ac4ed882ec..c6368387ad8d 100644 --- a/docs/posts/1brc/index.qmd +++ b/docs/posts/1brc/index.qmd @@ -1,7 +1,7 @@ --- title: "Using one Python dataframe API to take the billion row challenge with DuckDB, Polars, and DataFusion" author: "Cody" -date: "2024-01-40" +date: "2024-01-22" categories: - blog - duckdb @@ -106,7 +106,7 @@ DataFusion -- to complete the challenge. ### Setup Before we get started, we'll make some imports, turn on interactive mode, and -define the `kwargs` dictionariy for the backends corresponding to their +define the `kwargs` dictionary for the backends corresponding to their `read_csv` function: ```{python} @@ -148,10 +148,37 @@ datafusion_kwargs = { } ``` +Let's define a function to run the same code with each backend to complete the challenge: + +```{python} +def run_challenge(t): + res = ( + t.group_by(ibis._.station) + .agg( + min_temp=ibis._.temperature.min(), + mean_temp=ibis._.temperature.mean(), + max_temp=ibis._.temperature.max(), + ) + .order_by(ibis._.station.desc()) + ) + return res +``` + ### Completing the challenge Let's complete the challenge with each backend. +:::{.callout-note} +The results are the same across backends but look suspicious. It is noted in the +repository that the Python generation code is "unofficial", so may have some +problems. Given this is a contrived example of generated data, I'm not going to +worry about it. + +The point is that we can easily complete the challenge with the same code across +many backends, letting them worry about the details of execution. For this +reason, I'm also not providing execution times. Try it out yourself! 
+::: + ::: {.panel-tabset} ## DuckDb @@ -182,15 +209,7 @@ f"{t.count().to_pandas():,}" Finally, we'll compute the min, mean, and max temperature per weather station: ```{python} -res = ( - t.group_by(ibis._.station) - .agg( - min_temp=ibis._.temperature.min(), - mean_temp=ibis._.temperature.mean(), - max_temp=ibis._.temperature.max(), - ) - .order_by(ibis._.station.desc()) -) +res = run_challenge(t) res ``` @@ -221,15 +240,7 @@ f"{t.count().to_pandas():,}" Finally, we'll compute the min, mean, and max temperature per weather station: ```{python} -res = ( - t.group_by(ibis._.station) - .agg( - min_temp=ibis._.temperature.min(), - mean_temp=ibis._.temperature.mean(), - max_temp=ibis._.temperature.max(), - ) - .order_by(ibis._.station.desc()) -) +res = run_challenge(t) res ``` @@ -260,15 +271,7 @@ f"{t.count().to_pandas():,}" Finally, we'll compute the min, mean, and max temperature per weather station: ```{python} -res = ( - t.group_by(ibis._.station) - .agg( - min_temp=ibis._.temperature.min(), - mean_temp=ibis._.temperature.mean(), - max_temp=ibis._.temperature.max(), - ) - .order_by(ibis._.station.desc()) -) +res = run_challenge(t) res ``` @@ -296,7 +299,32 @@ t = con.sql(sql_str) con.create_table("billion", t, overwrite=True) ``` -Happy coding! +Nowadays I'd convert that to an Ibis expression: + +:::{.callout-note} +This is a slightly different result with a monotonic index column, but I prefer +it anyway. You could drop that column or adjust the expression. +::: + +```{.python} +import ibis + +con = ibis.connect("duckdb://data.ddb") + +ROWS = 1_000_000_000 + +t = ( + ibis.range(ROWS) + .unnest() + .name("index") + .as_table() + .mutate(**{c: ibis.random() for c in list(map(chr, range(ord("a"), ord("z") + 1)))}) +) +con.create_table("billion", t, overwrite=True) +``` + +But if you do need to construct a programmatic SQL string, it's cool that you +can! ## Conclusion @@ -304,3 +332,5 @@ While the one billion row challenge isn't a great benchmark, it's a fun way to demonstrate how Ibis provides a single Python dataframe API to take the billion row challenge with DuckDB, Polars, and DataFusion. Feel free to try it out with other backends! + +Happy coding! 
\ No newline at end of file From b3aaaae221ddce347ca326eeebde211db6b1854a Mon Sep 17 00:00:00 2001 From: Cody Date: Thu, 18 Jan 2024 11:51:15 -0500 Subject: [PATCH 07/13] add newline --- docs/_freeze/posts/1brc/index/execute-results/html.json | 8 ++++---- docs/posts/1brc/index.qmd | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/_freeze/posts/1brc/index/execute-results/html.json b/docs/_freeze/posts/1brc/index/execute-results/html.json index c9dd7fb37cc7..da968d9a32c1 100644 --- a/docs/_freeze/posts/1brc/index/execute-results/html.json +++ b/docs/_freeze/posts/1brc/index/execute-results/html.json @@ -1,10 +1,10 @@ { - "hash": "41a0a3202206a8095fbf2043d1954053", + "hash": "071d60e8b7e35939bff2e30c0c26fa5a", "result": { "engine": "jupyter", - "markdown": "---\ntitle: \"Using one Python dataframe API to take the billion row challenge with DuckDB, Polars, and DataFusion\"\nauthor: \"Cody\"\ndate: \"2024-01-22\"\ncategories:\n - blog\n - duckdb\n - polars\n - datafusion\n---\n\n## Overview\n\nThis is an implementation of the [The One Billion Row\nChallenge](https://www.morling.dev/blog/one-billion-row-challenge/):\n\n> Let’s kick off 2024 true coder style—​I’m excited to announce the One Billion\n> Row Challenge (1BRC), running from Jan 1 until Jan 31.\n\n> Your mission, should you decide to accept it, is deceptively simple: write a\n> Java program for retrieving temperature measurement values from a text file and\n> calculating the min, mean, and max temperature per weather station. There’s just\n> one caveat: the file has 1,000,000,000 rows!\n\nI haven't written Java since dropping a computer science course my second year\nof college that forced us to do functional programming exclusively in Java.\nHowever, I'll gladly take the challenge in Python using Ibis! In fact, I did\nsomething like this (generating a billion rows with 26 columns of random numbers\nand doing basic aggregations) to test out DuckDB and Polars.\n\nIn this blog, we'll demonstrate how Ibis provides a single Python dataframe API\nto take the billion row challenge with DuckDB, Polars, and DataFusion.\n\n## Setup\n\nWe need to generate the data from the challenge. First, clone the repo:\n\n```{.bash}\ngh repo clone gunnarmorling/1brc\n```\n\nThen change into the Python directory and run the generation script with the\nnumber of rows you want to generate:\n\n```{.bash}\ncd 1brc/src/main/python\npython create_measurements.py 1_000_000_000\n```\n\nThis will generate a file called `measurements.txt` in the `data` directory at\nthe root of the repo. It is 15GB on disk:\n\n```{.bash}\n(venv) cody@voda 1brc % du 1brc/data/*\n 15G 1brc/data/measurements.txt\n808K 1brc/data/weather_stations.csv\n```\n\nAnd consists of one billion rows with two columns separated by a semicolon:\n\n```{.bash}\n(venv) cody@voda 1brc % head 1brc/data/measurements.txt\nKusugal;-67.2\nIpil;-88.6\nSohna;-31.2\nLubuagan;-2.3\nSzentes;29.2\nSylvan Lake;-70.7\nAmbato;-35.2\nBerkine;97.0\nWernau;73.4\nKennewick;-19.9\n```\n\nAlso, you'll need to install Ibis with the three backends we'll use:\n\n```{.bash}\npip install ibis-framework[duckdb,polars,datafusion]\n```\n\n## Understanding Ibis\n\nIbis provides a standard dataframe API decoupled from the execution engine. 
It\ncompiles Ibis expressions to a form of intermediary representation (often SQL)\nthat can be executed by different backends.\n\nThis allows us to write a single Ibis expression to complete the challenge with\nmany different execution engine backends.\n\n:::{.callout-warning}\nWhile Ibis does its best to abstract away the differences between backends, this\ncannot be done in some areas like data input and output. For example, the\n`read_csv` function across various backends (in their SQL and Python forms) have\ndifferent parameters. We'll handle that with different `kwargs` dictionaries for\nthese backends in this post.\n\nIn general, besides creating a connection and data input/output, the Ibis API is\nthe same across backends.\n:::\n\n## Completing the challenge thrice\n\nWe'll use three great options for local backends -- DuckDB, Polars, and\nDataFusion -- to complete the challenge.\n\n### Setup\n\nBefore we get started, we'll make some imports, turn on interactive mode, and\ndefine the `kwargs` dictionary for the backends corresponding to their\n`read_csv` function:\n\n::: {#3054f50d .cell execution_count=1}\n``` {.python .cell-code}\nimport ibis\nimport polars as pl\nimport pyarrow as pa\n\nibis.options.interactive = True\n\nduckdb_kwargs = {\n \"delim\": \";\",\n \"header\": False,\n \"columns\": {\"station\": \"VARCHAR\", \"temperature\": \"DOUBLE\"},\n}\n\npolars_kwargs = {\n \"separator\": \";\",\n \"has_header\": False,\n \"new_columns\": [\"station\", \"temperature\"],\n \"schema\": {\"station\": pl.Utf8, \"temperature\": pl.Float64},\n}\n\ndatafusion_kwargs = {\n \"delimiter\": \";\",\n \"has_header\": False,\n \"schema\": pa.schema(\n [\n (\n \"station\",\n pa.string(),\n ),\n (\n \"temperature\",\n pa.float64(),\n ),\n ]\n ),\n \"file_extension\": \".txt\",\n}\n```\n:::\n\n\nLet's define a function to run the same code with each backend to complete the challenge:\n\n::: {#482ad8a7 .cell execution_count=2}\n``` {.python .cell-code}\ndef run_challenge(t):\n res = (\n t.group_by(ibis._.station)\n .agg(\n min_temp=ibis._.temperature.min(),\n mean_temp=ibis._.temperature.mean(),\n max_temp=ibis._.temperature.max(),\n )\n .order_by(ibis._.station.desc())\n )\n return res\n```\n:::\n\n\n### Completing the challenge\n\nLet's complete the challenge with each backend.\n\n:::{.callout-note}\nThe results are the same across backends but look suspicious. It is noted in the\nrepository that the Python generation code is \"unofficial\", so may have some\nproblems. Given this is a contrived example of generated data, I'm not going to\nworry about it.\n\nThe point is that we can easily complete the challenge with the same code across\nmany backends, letting them worry about the details of execution. For this\nreason, I'm also not providing execution times. Try it out yourself!\n:::\n\n::: {.panel-tabset}\n\n## DuckDb\n\nFirst let's set the backend to DuckDB (redundantly since it's the default) and\nthe `kwargs` dictionary:\n\n::: {#5b807cac .cell execution_count=3}\n``` {.python .cell-code}\nibis.set_backend(\"duckdb\") # <1>\nkwargs = duckdb_kwargs\n```\n:::\n\n\n1. Redundant given DuckDB is the default\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#8580246b .cell execution_count=4}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=18}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station ┃ temperature ┃\n┡━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string  │ float64     │\n├─────────┼─────────────┤\n│ Kusugal │       -67.2 │\n│ Ipil    │       -88.6 │\n│ Sohna   │       -31.2 │\n└─────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#77b744f3 .cell execution_count=5}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display}\n```{=html}\n\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=19}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#fa56fefc .cell execution_count=6}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display}\n```{=html}\n\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=20}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station            ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string             │ float64  │ float64   │ float64  │\n├────────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Hertogenbosch   │    -99.9 │ -0.021648 │     99.9 │\n│ ’Aïn el Hammam     │    -99.9 │ -0.213882 │     99.9 │\n│ ’Aïn Abid          │    -99.9 │ -0.240846 │     99.9 │\n│ ’Ali Ben Sliman    │    -99.9 │  0.247012 │     99.9 │\n│ ‘Aqrah             │    -99.9 │ -0.022524 │     99.9 │\n│ ‘Ajmān             │    -99.9 │  0.058018 │     99.9 │\n│ ‘Abasān al Kabīrah │    -99.9 │ -0.367529 │     99.9 │\n│ Ấp Tân Ngãi        │    -99.9 │ -0.163204 │     99.9 │\n│ Ấp Khánh Hòa       │    -99.9 │  0.015509 │     99.9 │\n│ Ḩā’il              │    -99.9 │ -0.359516 │     99.9 │\n│ …                  │        … │         … │        … │\n└────────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n## Polars\n\nFirst let's set the backend to Polars and the `kwargs` dictionary:\n\n::: {#f704323d .cell execution_count=7}\n``` {.python .cell-code}\nibis.set_backend(\"polars\") # <1>\nkwargs = polars_kwargs\n```\n:::\n\n\n1. Set Polars as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#496119ab .cell execution_count=8}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=22}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station ┃ temperature ┃\n┡━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string  │ float64     │\n├─────────┼─────────────┤\n│ Kusugal │       -67.2 │\n│ Ipil    │       -88.6 │\n│ Sohna   │       -31.2 │\n└─────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#08a6d932 .cell execution_count=9}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=23}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#dba6844d .cell execution_count=10}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=24}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station            ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string             │ float64  │ float64   │ float64  │\n├────────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Hertogenbosch   │    -99.9 │ -0.021648 │     99.9 │\n│ ’Aïn el Hammam     │    -99.9 │ -0.213882 │     99.9 │\n│ ’Aïn Abid          │    -99.9 │ -0.240846 │     99.9 │\n│ ’Ali Ben Sliman    │    -99.9 │  0.247012 │     99.9 │\n│ ‘Aqrah             │    -99.9 │ -0.022524 │     99.9 │\n│ ‘Ajmān             │    -99.9 │  0.058018 │     99.9 │\n│ ‘Abasān al Kabīrah │    -99.9 │ -0.367529 │     99.9 │\n│ Ấp Tân Ngãi        │    -99.9 │ -0.163204 │     99.9 │\n│ Ấp Khánh Hòa       │    -99.9 │  0.015509 │     99.9 │\n│ Ḩā’il              │    -99.9 │ -0.359516 │     99.9 │\n│ …                  │        … │         … │        … │\n└────────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n## DataFusion\n\nFirst let's set the backend to DataFusion and the `kwargs` dictionary:\n\n::: {#d8a4956e .cell execution_count=11}\n``` {.python .cell-code}\nibis.set_backend(\"datafusion\") # <1>\nkwargs = datafusion_kwargs\n```\n:::\n\n\n1. Set DataFusion as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#7b5e2d13 .cell execution_count=12}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=26}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station ┃ temperature ┃\n┡━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string  │ float64     │\n├─────────┼─────────────┤\n│ Kusugal │       -67.2 │\n│ Ipil    │       -88.6 │\n│ Sohna   │       -31.2 │\n└─────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#d93e68a6 .cell execution_count=13}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=27}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#985bcbf3 .cell execution_count=14}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=28}\n```{=html}\n
┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station            ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string             │ float64  │ float64   │ float64  │\n├────────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Hertogenbosch   │    -99.9 │ -0.021648 │     99.9 │\n│ ’Aïn el Hammam     │    -99.9 │ -0.213882 │     99.9 │\n│ ’Aïn Abid          │    -99.9 │ -0.240846 │     99.9 │\n│ ’Ali Ben Sliman    │    -99.9 │  0.247012 │     99.9 │\n│ ‘Aqrah             │    -99.9 │ -0.022524 │     99.9 │\n│ ‘Ajmān             │    -99.9 │  0.058018 │     99.9 │\n│ ‘Abasān al Kabīrah │    -99.9 │ -0.367529 │     99.9 │\n│ Ấp Tân Ngãi        │    -99.9 │ -0.163204 │     99.9 │\n│ Ấp Khánh Hòa       │    -99.9 │  0.015509 │     99.9 │\n│ Ḩā’il              │    -99.9 │ -0.359516 │     99.9 │\n│ …                  │        … │         … │        … │\n└────────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n:::\n\n## Bonus: more billion row data generation\n\nWhile we're here, I'll share the code I've used in the past to generate a\nbillion rows of random data:\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nsql_str = \"\"\nsql_str += \"select\\n\"\nfor c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1))):\n sql_str += f\" random() as {c},\\n\"\nsql_str += f\"from generate_series(1, {ROWS})\"\n\nt = con.sql(sql_str)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nNowadays I'd convert that to an Ibis expression:\n\n:::{.callout-note}\nThis is a slightly different result with a monotonic index column, but I prefer\nit anyway. You could drop that column or adjust the expression.\n:::\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nt = (\n ibis.range(ROWS)\n .unnest()\n .name(\"index\")\n .as_table()\n .mutate(**{c: ibis.random() for c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1)))})\n)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nBut if you do need to construct a programmatic SQL string, it's cool that you\ncan!\n\n## Conclusion\n\nWhile the one billion row challenge isn't a great benchmark, it's a fun way to\ndemonstrate how Ibis provides a single Python dataframe API to take the billion\nrow challenge with DuckDB, Polars, and DataFusion. Feel free to try it out with\nother backends!\n\nHappy coding!\n\n", + "markdown": "---\ntitle: \"Using one Python dataframe API to take the billion row challenge with DuckDB, Polars, and DataFusion\"\nauthor: \"Cody\"\ndate: \"2024-01-22\"\ncategories:\n - blog\n - duckdb\n - polars\n - datafusion\n---\n\n## Overview\n\nThis is an implementation of the [The One Billion Row\nChallenge](https://www.morling.dev/blog/one-billion-row-challenge/):\n\n> Let’s kick off 2024 true coder style—​I’m excited to announce the One Billion\n> Row Challenge (1BRC), running from Jan 1 until Jan 31.\n\n> Your mission, should you decide to accept it, is deceptively simple: write a\n> Java program for retrieving temperature measurement values from a text file and\n> calculating the min, mean, and max temperature per weather station. There’s just\n> one caveat: the file has 1,000,000,000 rows!\n\nI haven't written Java since dropping a computer science course my second year\nof college that forced us to do functional programming exclusively in Java.\nHowever, I'll gladly take the challenge in Python using Ibis! In fact, I did\nsomething like this (generating a billion rows with 26 columns of random numbers\nand doing basic aggregations) to test out DuckDB and Polars.\n\nIn this blog, we'll demonstrate how Ibis provides a single Python dataframe API\nto take the billion row challenge with DuckDB, Polars, and DataFusion.\n\n## Setup\n\nWe need to generate the data from the challenge. First, clone the repo:\n\n```{.bash}\ngh repo clone gunnarmorling/1brc\n```\n\nThen change into the Python directory and run the generation script with the\nnumber of rows you want to generate:\n\n```{.bash}\ncd 1brc/src/main/python\npython create_measurements.py 1_000_000_000\n```\n\nThis will generate a file called `measurements.txt` in the `data` directory at\nthe root of the repo. 
It is 15GB on disk:\n\n```{.bash}\n(venv) cody@voda 1brc % du 1brc/data/*\n 15G 1brc/data/measurements.txt\n808K 1brc/data/weather_stations.csv\n```\n\nAnd consists of one billion rows with two columns separated by a semicolon:\n\n```{.bash}\n(venv) cody@voda 1brc % head 1brc/data/measurements.txt\nKusugal;-67.2\nIpil;-88.6\nSohna;-31.2\nLubuagan;-2.3\nSzentes;29.2\nSylvan Lake;-70.7\nAmbato;-35.2\nBerkine;97.0\nWernau;73.4\nKennewick;-19.9\n```\n\nAlso, you'll need to install Ibis with the three backends we'll use:\n\n```{.bash}\npip install ibis-framework[duckdb,polars,datafusion]\n```\n\n## Understanding Ibis\n\nIbis provides a standard dataframe API decoupled from the execution engine. It\ncompiles Ibis expressions to a form of intermediary representation (often SQL)\nthat can be executed by different backends.\n\nThis allows us to write a single Ibis expression to complete the challenge with\nmany different execution engine backends.\n\n:::{.callout-warning}\nWhile Ibis does its best to abstract away the differences between backends, this\ncannot be done in some areas like data input and output. For example, the\n`read_csv` function across various backends (in their SQL and Python forms) have\ndifferent parameters. We'll handle that with different `kwargs` dictionaries for\nthese backends in this post.\n\nIn general, besides creating a connection and data input/output, the Ibis API is\nthe same across backends.\n:::\n\n## Completing the challenge thrice\n\nWe'll use three great options for local backends -- DuckDB, Polars, and\nDataFusion -- to complete the challenge.\n\n### Setup\n\nBefore we get started, we'll make some imports, turn on interactive mode, and\ndefine the `kwargs` dictionary for the backends corresponding to their\n`read_csv` function:\n\n::: {#175dc3ee .cell execution_count=1}\n``` {.python .cell-code}\nimport ibis\nimport polars as pl\nimport pyarrow as pa\n\nibis.options.interactive = True\n\nduckdb_kwargs = {\n \"delim\": \";\",\n \"header\": False,\n \"columns\": {\"station\": \"VARCHAR\", \"temperature\": \"DOUBLE\"},\n}\n\npolars_kwargs = {\n \"separator\": \";\",\n \"has_header\": False,\n \"new_columns\": [\"station\", \"temperature\"],\n \"schema\": {\"station\": pl.Utf8, \"temperature\": pl.Float64},\n}\n\ndatafusion_kwargs = {\n \"delimiter\": \";\",\n \"has_header\": False,\n \"schema\": pa.schema(\n [\n (\n \"station\",\n pa.string(),\n ),\n (\n \"temperature\",\n pa.float64(),\n ),\n ]\n ),\n \"file_extension\": \".txt\",\n}\n```\n:::\n\n\nLet's define a function to run the same code with each backend to complete the challenge:\n\n::: {#2869659a .cell execution_count=2}\n``` {.python .cell-code}\ndef run_challenge(t):\n res = (\n t.group_by(ibis._.station)\n .agg(\n min_temp=ibis._.temperature.min(),\n mean_temp=ibis._.temperature.mean(),\n max_temp=ibis._.temperature.max(),\n )\n .order_by(ibis._.station.desc())\n )\n return res\n```\n:::\n\n\n### Completing the challenge\n\nLet's complete the challenge with each backend.\n\n:::{.callout-note}\nThe results are the same across backends but look suspicious. It is noted in the\nrepository that the Python generation code is \"unofficial\", so may have some\nproblems. Given this is a contrived example of generated data, I'm not going to\nworry about it.\n\nThe point is that we can easily complete the challenge with the same code across\nmany backends, letting them worry about the details of execution. For this\nreason, I'm also not providing execution times. 
Try it out yourself!\n:::\n\n::: {.panel-tabset}\n\n## DuckDb\n\nFirst let's set the backend to DuckDB (redundantly since it's the default) and\nthe `kwargs` dictionary:\n\n::: {#5b598c15 .cell execution_count=3}\n``` {.python .cell-code}\nibis.set_backend(\"duckdb\") # <1>\nkwargs = duckdb_kwargs\n```\n:::\n\n\n1. Redundant given DuckDB is the default\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#5c1b91a8 .cell execution_count=4}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=4}\n```{=html}\n
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station     ┃ temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string      │ float64     │\n├─────────────┼─────────────┤\n│ Lívingston  │       -21.0 │\n│ Annūr       │       -33.4 │\n│ Beni Douala │        16.5 │\n└─────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#95fa302a .cell execution_count=5}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display}\n```{=html}\n\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=5}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#306280f8 .cell execution_count=6}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display}\n```{=html}\n\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=6}\n```{=html}\n
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station        ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string         │ float64  │ float64   │ float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel  │    -99.9 │  0.112188 │     99.9 │\n│ ’Aïn el Hammam │    -99.9 │ -0.225289 │     99.9 │\n│ ’Aïn Roua      │    -99.9 │ -0.198241 │     99.9 │\n│ ‘Ibrī          │    -99.9 │  0.009499 │     99.9 │\n│ ‘Ayn al ‘Arab  │    -99.9 │  0.124730 │     99.9 │\n│ ‘Akko          │    -99.9 │ -0.087184 │     99.9 │\n│ ‘Afrīn         │    -99.9 │ -0.013322 │     99.9 │\n│ Ấp Tân Ngãi    │    -99.9 │  0.344089 │     99.9 │\n│ Ẕefat          │    -99.9 │  0.017767 │     99.9 │\n│ Ḩīsh           │    -99.9 │  0.018804 │     99.9 │\n│ …              │        … │         … │        … │\n└────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n## Polars\n\nFirst let's set the backend to Polars and the `kwargs` dictionary:\n\n::: {#69bc0e91 .cell execution_count=7}\n``` {.python .cell-code}\nibis.set_backend(\"polars\") # <1>\nkwargs = polars_kwargs\n```\n:::\n\n\n1. Set Polars as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#5a489e2f .cell execution_count=8}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=8}\n```{=html}\n
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station     ┃ temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string      │ float64     │\n├─────────────┼─────────────┤\n│ Lívingston  │       -21.0 │\n│ Annūr       │       -33.4 │\n│ Beni Douala │        16.5 │\n└─────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#2d525ffc .cell execution_count=9}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=9}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#c777efb6 .cell execution_count=10}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=10}\n```{=html}\n
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station        ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string         │ float64  │ float64   │ float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel  │    -99.9 │  0.112188 │     99.9 │\n│ ’Aïn el Hammam │    -99.9 │ -0.225289 │     99.9 │\n│ ’Aïn Roua      │    -99.9 │ -0.198241 │     99.9 │\n│ ‘Ibrī          │    -99.9 │  0.009499 │     99.9 │\n│ ‘Ayn al ‘Arab  │    -99.9 │  0.124730 │     99.9 │\n│ ‘Akko          │    -99.9 │ -0.087184 │     99.9 │\n│ ‘Afrīn         │    -99.9 │ -0.013322 │     99.9 │\n│ Ấp Tân Ngãi    │    -99.9 │  0.344089 │     99.9 │\n│ Ẕefat          │    -99.9 │  0.017767 │     99.9 │\n│ Ḩīsh           │    -99.9 │  0.018804 │     99.9 │\n│ …              │        … │         … │        … │\n└────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n## DataFusion\n\nFirst let's set the backend to DataFusion and the `kwargs` dictionary:\n\n::: {#9f983c04 .cell execution_count=11}\n``` {.python .cell-code}\nibis.set_backend(\"datafusion\") # <1>\nkwargs = datafusion_kwargs\n```\n:::\n\n\n1. Set DataFusion as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#dc74b909 .cell execution_count=12}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=12}\n```{=html}\n
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station     ┃ temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string      │ float64     │\n├─────────────┼─────────────┤\n│ Lívingston  │       -21.0 │\n│ Annūr       │       -33.4 │\n│ Beni Douala │        16.5 │\n└─────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#6eead7c5 .cell execution_count=13}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=13}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#20a95e7d .cell execution_count=14}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=14}\n```{=html}\n
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station        ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string         │ float64  │ float64   │ float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel  │    -99.9 │  0.112188 │     99.9 │\n│ ’Aïn el Hammam │    -99.9 │ -0.225289 │     99.9 │\n│ ’Aïn Roua      │    -99.9 │ -0.198241 │     99.9 │\n│ ‘Ibrī          │    -99.9 │  0.009499 │     99.9 │\n│ ‘Ayn al ‘Arab  │    -99.9 │  0.124730 │     99.9 │\n│ ‘Akko          │    -99.9 │ -0.087184 │     99.9 │\n│ ‘Afrīn         │    -99.9 │ -0.013322 │     99.9 │\n│ Ấp Tân Ngãi    │    -99.9 │  0.344089 │     99.9 │\n│ Ẕefat          │    -99.9 │  0.017767 │     99.9 │\n│ Ḩīsh           │    -99.9 │  0.018804 │     99.9 │\n│ …              │        … │         … │        … │\n└────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n:::\n\n## Bonus: more billion row data generation\n\nWhile we're here, I'll share the code I've used in the past to generate a\nbillion rows of random data:\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nsql_str = \"\"\nsql_str += \"select\\n\"\nfor c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1))):\n sql_str += f\" random() as {c},\\n\"\nsql_str += f\"from generate_series(1, {ROWS})\"\n\nt = con.sql(sql_str)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nNowadays I'd convert that to an Ibis expression:\n\n:::{.callout-note}\nThis is a slightly different result with a monotonic index column, but I prefer\nit anyway. You could drop that column or adjust the expression.\n:::\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nt = (\n ibis.range(ROWS)\n .unnest()\n .name(\"index\")\n .as_table()\n .mutate(**{c: ibis.random() for c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1)))})\n)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nBut if you do need to construct a programmatic SQL string, it's cool that you\ncan!\n\n## Conclusion\n\nWhile the one billion row challenge isn't a great benchmark, it's a fun way to\ndemonstrate how Ibis provides a single Python dataframe API to take the billion\nrow challenge with DuckDB, Polars, and DataFusion. Feel free to try it out with\nother backends!\n\nHappy coding!\n\n", "supporting": [ - "index_files" + "index_files/figure-html" ], "filters": [], "includes": { @@ -12,7 +12,7 @@ "\n\n\n\n" ], "include-after-body": [ - "\n" + "\n" ] } } diff --git a/docs/posts/1brc/index.qmd b/docs/posts/1brc/index.qmd index c6368387ad8d..bf3816417a6e 100644 --- a/docs/posts/1brc/index.qmd +++ b/docs/posts/1brc/index.qmd @@ -333,4 +333,4 @@ demonstrate how Ibis provides a single Python dataframe API to take the billion row challenge with DuckDB, Polars, and DataFusion. Feel free to try it out with other backends! -Happy coding! \ No newline at end of file +Happy coding! 
From 3451950b6598bb4778c3ae90c72298eea4725890 Mon Sep 17 00:00:00 2001 From: Cody Date: Thu, 18 Jan 2024 12:29:36 -0500 Subject: [PATCH 08/13] add repo link --- docs/_freeze/posts/1brc/index/execute-results/html.json | 9 +++------ docs/posts/1brc/index.qmd | 9 ++++++++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/docs/_freeze/posts/1brc/index/execute-results/html.json b/docs/_freeze/posts/1brc/index/execute-results/html.json index da968d9a32c1..02d2e06d5694 100644 --- a/docs/_freeze/posts/1brc/index/execute-results/html.json +++ b/docs/_freeze/posts/1brc/index/execute-results/html.json @@ -1,18 +1,15 @@ { - "hash": "071d60e8b7e35939bff2e30c0c26fa5a", + "hash": "111b00d73a1e825518aaeda4e5cbb1aa", "result": { "engine": "jupyter", - "markdown": "---\ntitle: \"Using one Python dataframe API to take the billion row challenge with DuckDB, Polars, and DataFusion\"\nauthor: \"Cody\"\ndate: \"2024-01-22\"\ncategories:\n - blog\n - duckdb\n - polars\n - datafusion\n---\n\n## Overview\n\nThis is an implementation of the [The One Billion Row\nChallenge](https://www.morling.dev/blog/one-billion-row-challenge/):\n\n> Let’s kick off 2024 true coder style—​I’m excited to announce the One Billion\n> Row Challenge (1BRC), running from Jan 1 until Jan 31.\n\n> Your mission, should you decide to accept it, is deceptively simple: write a\n> Java program for retrieving temperature measurement values from a text file and\n> calculating the min, mean, and max temperature per weather station. There’s just\n> one caveat: the file has 1,000,000,000 rows!\n\nI haven't written Java since dropping a computer science course my second year\nof college that forced us to do functional programming exclusively in Java.\nHowever, I'll gladly take the challenge in Python using Ibis! In fact, I did\nsomething like this (generating a billion rows with 26 columns of random numbers\nand doing basic aggregations) to test out DuckDB and Polars.\n\nIn this blog, we'll demonstrate how Ibis provides a single Python dataframe API\nto take the billion row challenge with DuckDB, Polars, and DataFusion.\n\n## Setup\n\nWe need to generate the data from the challenge. First, clone the repo:\n\n```{.bash}\ngh repo clone gunnarmorling/1brc\n```\n\nThen change into the Python directory and run the generation script with the\nnumber of rows you want to generate:\n\n```{.bash}\ncd 1brc/src/main/python\npython create_measurements.py 1_000_000_000\n```\n\nThis will generate a file called `measurements.txt` in the `data` directory at\nthe root of the repo. It is 15GB on disk:\n\n```{.bash}\n(venv) cody@voda 1brc % du 1brc/data/*\n 15G 1brc/data/measurements.txt\n808K 1brc/data/weather_stations.csv\n```\n\nAnd consists of one billion rows with two columns separated by a semicolon:\n\n```{.bash}\n(venv) cody@voda 1brc % head 1brc/data/measurements.txt\nKusugal;-67.2\nIpil;-88.6\nSohna;-31.2\nLubuagan;-2.3\nSzentes;29.2\nSylvan Lake;-70.7\nAmbato;-35.2\nBerkine;97.0\nWernau;73.4\nKennewick;-19.9\n```\n\nAlso, you'll need to install Ibis with the three backends we'll use:\n\n```{.bash}\npip install ibis-framework[duckdb,polars,datafusion]\n```\n\n## Understanding Ibis\n\nIbis provides a standard dataframe API decoupled from the execution engine. 
It\ncompiles Ibis expressions to a form of intermediary representation (often SQL)\nthat can be executed by different backends.\n\nThis allows us to write a single Ibis expression to complete the challenge with\nmany different execution engine backends.\n\n:::{.callout-warning}\nWhile Ibis does its best to abstract away the differences between backends, this\ncannot be done in some areas like data input and output. For example, the\n`read_csv` function across various backends (in their SQL and Python forms) have\ndifferent parameters. We'll handle that with different `kwargs` dictionaries for\nthese backends in this post.\n\nIn general, besides creating a connection and data input/output, the Ibis API is\nthe same across backends.\n:::\n\n## Completing the challenge thrice\n\nWe'll use three great options for local backends -- DuckDB, Polars, and\nDataFusion -- to complete the challenge.\n\n### Setup\n\nBefore we get started, we'll make some imports, turn on interactive mode, and\ndefine the `kwargs` dictionary for the backends corresponding to their\n`read_csv` function:\n\n::: {#175dc3ee .cell execution_count=1}\n``` {.python .cell-code}\nimport ibis\nimport polars as pl\nimport pyarrow as pa\n\nibis.options.interactive = True\n\nduckdb_kwargs = {\n \"delim\": \";\",\n \"header\": False,\n \"columns\": {\"station\": \"VARCHAR\", \"temperature\": \"DOUBLE\"},\n}\n\npolars_kwargs = {\n \"separator\": \";\",\n \"has_header\": False,\n \"new_columns\": [\"station\", \"temperature\"],\n \"schema\": {\"station\": pl.Utf8, \"temperature\": pl.Float64},\n}\n\ndatafusion_kwargs = {\n \"delimiter\": \";\",\n \"has_header\": False,\n \"schema\": pa.schema(\n [\n (\n \"station\",\n pa.string(),\n ),\n (\n \"temperature\",\n pa.float64(),\n ),\n ]\n ),\n \"file_extension\": \".txt\",\n}\n```\n:::\n\n\nLet's define a function to run the same code with each backend to complete the challenge:\n\n::: {#2869659a .cell execution_count=2}\n``` {.python .cell-code}\ndef run_challenge(t):\n res = (\n t.group_by(ibis._.station)\n .agg(\n min_temp=ibis._.temperature.min(),\n mean_temp=ibis._.temperature.mean(),\n max_temp=ibis._.temperature.max(),\n )\n .order_by(ibis._.station.desc())\n )\n return res\n```\n:::\n\n\n### Completing the challenge\n\nLet's complete the challenge with each backend.\n\n:::{.callout-note}\nThe results are the same across backends but look suspicious. It is noted in the\nrepository that the Python generation code is \"unofficial\", so may have some\nproblems. Given this is a contrived example of generated data, I'm not going to\nworry about it.\n\nThe point is that we can easily complete the challenge with the same code across\nmany backends, letting them worry about the details of execution. For this\nreason, I'm also not providing execution times. Try it out yourself!\n:::\n\n::: {.panel-tabset}\n\n## DuckDb\n\nFirst let's set the backend to DuckDB (redundantly since it's the default) and\nthe `kwargs` dictionary:\n\n::: {#5b598c15 .cell execution_count=3}\n``` {.python .cell-code}\nibis.set_backend(\"duckdb\") # <1>\nkwargs = duckdb_kwargs\n```\n:::\n\n\n1. Redundant given DuckDB is the default\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#5c1b91a8 .cell execution_count=4}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=4}\n```{=html}\n
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station     ┃ temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string      │ float64     │\n├─────────────┼─────────────┤\n│ Lívingston  │       -21.0 │\n│ Annūr       │       -33.4 │\n│ Beni Douala │        16.5 │\n└─────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#95fa302a .cell execution_count=5}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display}\n```{=html}\n\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=5}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#306280f8 .cell execution_count=6}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display}\n```{=html}\n\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=6}\n```{=html}\n
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station        ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string         │ float64  │ float64   │ float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel  │    -99.9 │  0.112188 │     99.9 │\n│ ’Aïn el Hammam │    -99.9 │ -0.225289 │     99.9 │\n│ ’Aïn Roua      │    -99.9 │ -0.198241 │     99.9 │\n│ ‘Ibrī          │    -99.9 │  0.009499 │     99.9 │\n│ ‘Ayn al ‘Arab  │    -99.9 │  0.124730 │     99.9 │\n│ ‘Akko          │    -99.9 │ -0.087184 │     99.9 │\n│ ‘Afrīn         │    -99.9 │ -0.013322 │     99.9 │\n│ Ấp Tân Ngãi    │    -99.9 │  0.344089 │     99.9 │\n│ Ẕefat          │    -99.9 │  0.017767 │     99.9 │\n│ Ḩīsh           │    -99.9 │  0.018804 │     99.9 │\n│ …              │        … │         … │        … │\n└────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n## Polars\n\nFirst let's set the backend to Polars and the `kwargs` dictionary:\n\n::: {#69bc0e91 .cell execution_count=7}\n``` {.python .cell-code}\nibis.set_backend(\"polars\") # <1>\nkwargs = polars_kwargs\n```\n:::\n\n\n1. Set Polars as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#5a489e2f .cell execution_count=8}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=8}\n```{=html}\n
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station     ┃ temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string      │ float64     │\n├─────────────┼─────────────┤\n│ Lívingston  │       -21.0 │\n│ Annūr       │       -33.4 │\n│ Beni Douala │        16.5 │\n└─────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#2d525ffc .cell execution_count=9}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=9}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#c777efb6 .cell execution_count=10}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=10}\n```{=html}\n
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station        ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string         │ float64  │ float64   │ float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel  │    -99.9 │  0.112188 │     99.9 │\n│ ’Aïn el Hammam │    -99.9 │ -0.225289 │     99.9 │\n│ ’Aïn Roua      │    -99.9 │ -0.198241 │     99.9 │\n│ ‘Ibrī          │    -99.9 │  0.009499 │     99.9 │\n│ ‘Ayn al ‘Arab  │    -99.9 │  0.124730 │     99.9 │\n│ ‘Akko          │    -99.9 │ -0.087184 │     99.9 │\n│ ‘Afrīn         │    -99.9 │ -0.013322 │     99.9 │\n│ Ấp Tân Ngãi    │    -99.9 │  0.344089 │     99.9 │\n│ Ẕefat          │    -99.9 │  0.017767 │     99.9 │\n│ Ḩīsh           │    -99.9 │  0.018804 │     99.9 │\n│ …              │        … │         … │        … │\n└────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n## DataFusion\n\nFirst let's set the backend to DataFusion and the `kwargs` dictionary:\n\n::: {#9f983c04 .cell execution_count=11}\n``` {.python .cell-code}\nibis.set_backend(\"datafusion\") # <1>\nkwargs = datafusion_kwargs\n```\n:::\n\n\n1. Set DataFusion as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#dc74b909 .cell execution_count=12}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=12}\n```{=html}\n
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station     ┃ temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string      │ float64     │\n├─────────────┼─────────────┤\n│ Lívingston  │       -21.0 │\n│ Annūr       │       -33.4 │\n│ Beni Douala │        16.5 │\n└─────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#6eead7c5 .cell execution_count=13}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=13}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#20a95e7d .cell execution_count=14}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=14}\n```{=html}\n
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station        ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string         │ float64  │ float64   │ float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel  │    -99.9 │  0.112188 │     99.9 │\n│ ’Aïn el Hammam │    -99.9 │ -0.225289 │     99.9 │\n│ ’Aïn Roua      │    -99.9 │ -0.198241 │     99.9 │\n│ ‘Ibrī          │    -99.9 │  0.009499 │     99.9 │\n│ ‘Ayn al ‘Arab  │    -99.9 │  0.124730 │     99.9 │\n│ ‘Akko          │    -99.9 │ -0.087184 │     99.9 │\n│ ‘Afrīn         │    -99.9 │ -0.013322 │     99.9 │\n│ Ấp Tân Ngãi    │    -99.9 │  0.344089 │     99.9 │\n│ Ẕefat          │    -99.9 │  0.017767 │     99.9 │\n│ Ḩīsh           │    -99.9 │  0.018804 │     99.9 │\n│ …              │        … │         … │        … │\n└────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n:::\n\n## Bonus: more billion row data generation\n\nWhile we're here, I'll share the code I've used in the past to generate a\nbillion rows of random data:\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nsql_str = \"\"\nsql_str += \"select\\n\"\nfor c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1))):\n sql_str += f\" random() as {c},\\n\"\nsql_str += f\"from generate_series(1, {ROWS})\"\n\nt = con.sql(sql_str)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nNowadays I'd convert that to an Ibis expression:\n\n:::{.callout-note}\nThis is a slightly different result with a monotonic index column, but I prefer\nit anyway. You could drop that column or adjust the expression.\n:::\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nt = (\n ibis.range(ROWS)\n .unnest()\n .name(\"index\")\n .as_table()\n .mutate(**{c: ibis.random() for c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1)))})\n)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nBut if you do need to construct a programmatic SQL string, it's cool that you\ncan!\n\n## Conclusion\n\nWhile the one billion row challenge isn't a great benchmark, it's a fun way to\ndemonstrate how Ibis provides a single Python dataframe API to take the billion\nrow challenge with DuckDB, Polars, and DataFusion. Feel free to try it out with\nother backends!\n\nHappy coding!\n\n", + "markdown": "---\ntitle: \"Using one Python dataframe API to take the billion row challenge with DuckDB, Polars, and DataFusion\"\nauthor: \"Cody\"\ndate: \"2024-01-22\"\ncategories:\n - blog\n - duckdb\n - polars\n - datafusion\n---\n\n## Overview\n\nThis is an implementation of the [The One Billion Row\nChallenge](https://www.morling.dev/blog/one-billion-row-challenge/):\n\n> Let’s kick off 2024 true coder style—​I’m excited to announce the One Billion\n> Row Challenge (1BRC), running from Jan 1 until Jan 31.\n\n> Your mission, should you decide to accept it, is deceptively simple: write a\n> Java program for retrieving temperature measurement values from a text file and\n> calculating the min, mean, and max temperature per weather station. There’s just\n> one caveat: the file has 1,000,000,000 rows!\n\nI haven't written Java since dropping a computer science course my second year\nof college that forced us to do functional programming exclusively in Java.\nHowever, I'll gladly take the challenge in Python using Ibis! In fact, I did\nsomething like this (generating a billion rows with 26 columns of random numbers\nand doing basic aggregations) to test out DuckDB and Polars.\n\nIn this blog, we'll demonstrate how Ibis provides a single Python dataframe API\nto take the billion row challenge with DuckDB, Polars, and DataFusion.\n\n## Setup\n\nWe need to generate the data from the challenge. First, clone the\n[repo](https://github.com/gunnarmorling/1brc):\n\n```{.bash}\ngh repo clone gunnarmorling/1brc\n```\n\nThen change into the Python directory and run the generation script with the\nnumber of rows you want to generate:\n\n```{.bash}\ncd 1brc/src/main/python\npython create_measurements.py 1_000_000_000\n```\n\nThis will generate a file called `measurements.txt` in the `data` directory at\nthe root of the repo. 
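(If you just want to smoke-test the pipeline first, you can pass the script a\nsmaller count, e.g. `python create_measurements.py 1_000_000`.) 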
It is 15GB on disk:\n\n```{.bash}\n(venv) cody@voda 1brc % du 1brc/data/*\n 15G 1brc/data/measurements.txt\n808K 1brc/data/weather_stations.csv\n```\n\nAnd consists of one billion rows with two columns separated by a semicolon:\n\n```{.bash}\n(venv) cody@voda 1brc % head 1brc/data/measurements.txt\nKusugal;-67.2\nIpil;-88.6\nSohna;-31.2\nLubuagan;-2.3\nSzentes;29.2\nSylvan Lake;-70.7\nAmbato;-35.2\nBerkine;97.0\nWernau;73.4\nKennewick;-19.9\n```\n\nAlso, you'll need to install Ibis with the three backends we'll use:\n\n```{.bash}\npip install ibis-framework[duckdb,polars,datafusion]\n```\n\n## Understanding Ibis\n\nIbis provides a standard dataframe API decoupled from the execution engine. It\ncompiles Ibis expressions to a form of intermediary representation (often SQL)\nthat can be executed by different backends.\n\nThis allows us to write a single Ibis expression to complete the challenge with\nmany different execution engine backends.\n\n:::{.callout-warning}\nWhile Ibis does its best to abstract away the differences between backends, this\ncannot be done in some areas like data input and output. For example, the\n`read_csv` function across various backends (in their SQL and Python forms) have\ndifferent parameters. We'll handle that with different `kwargs` dictionaries for\nthese backends in this post.\n\nIn general, besides creating a connection and data input/output, the Ibis API is\nthe same across backends.\n:::\n\n## Completing the challenge thrice\n\nWe'll use three great options for local backends -- DuckDB, Polars, and\nDataFusion -- to complete the challenge.\n\n### Setup\n\nBefore we get started, we'll make some imports, turn on interactive mode, and\ndefine the `kwargs` dictionary for the backends corresponding to their\n`read_csv` function:\n\n::: {#a0de49bb .cell execution_count=1}\n``` {.python .cell-code}\nimport ibis\nimport polars as pl\nimport pyarrow as pa\n\nibis.options.interactive = True\n\nduckdb_kwargs = {\n \"delim\": \";\",\n \"header\": False,\n \"columns\": {\"station\": \"VARCHAR\", \"temperature\": \"DOUBLE\"},\n}\n\npolars_kwargs = {\n \"separator\": \";\",\n \"has_header\": False,\n \"new_columns\": [\"station\", \"temperature\"],\n \"schema\": {\"station\": pl.Utf8, \"temperature\": pl.Float64},\n}\n\ndatafusion_kwargs = {\n \"delimiter\": \";\",\n \"has_header\": False,\n \"schema\": pa.schema(\n [\n (\n \"station\",\n pa.string(),\n ),\n (\n \"temperature\",\n pa.float64(),\n ),\n ]\n ),\n \"file_extension\": \".txt\",\n}\n```\n:::\n\n\nLet's define a function to run the same code with each backend to complete the challenge:\n\n::: {#0bdcaf94 .cell execution_count=2}\n``` {.python .cell-code}\ndef run_challenge(t):\n res = (\n t.group_by(ibis._.station)\n .agg(\n min_temp=ibis._.temperature.min(),\n mean_temp=ibis._.temperature.mean(),\n max_temp=ibis._.temperature.max(),\n )\n .order_by(ibis._.station.desc())\n )\n return res\n```\n:::\n\n\n### Completing the challenge\n\nLet's complete the challenge with each backend.\n\n:::{.callout-note}\nThe results are the same across backends but look suspicious. It is noted in the\nrepository that the Python generation code is \"unofficial\", so may have some\nproblems. Given this is a contrived example of generated data, I'm not going to\nworry about it.\n\nThe point is that we can easily complete the challenge with the same code across\nmany backends, letting them worry about the details of execution. For this\nreason, I'm also not providing execution times. 
Try it out yourself!\n:::\n\n::: {.panel-tabset}\n\n## DuckDb\n\nFirst let's set the backend to DuckDB (redundantly since it's the default) and\nthe `kwargs` dictionary:\n\n::: {#db81b2d0 .cell execution_count=3}\n``` {.python .cell-code}\nibis.set_backend(\"duckdb\") # <1>\nkwargs = duckdb_kwargs\n```\n:::\n\n\n::: {#779c8b66 .cell execution_count=4}\n\n::: {.cell-output .cell-output-display execution_count=4}\n```\n\n```\n:::\n:::\n\n\n1. Redundant given DuckDB is the default\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#b7f39b43 .cell execution_count=5}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=5}\n```{=html}\n
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station     ┃ temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string      │ float64     │\n├─────────────┼─────────────┤\n│ Lívingston  │       -21.0 │\n│ Annūr       │       -33.4 │\n│ Beni Douala │        16.5 │\n└─────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#2b918636 .cell execution_count=6}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=6}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#ee31ab49 .cell execution_count=7}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=7}\n```{=html}\n
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station        ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string         │ float64  │ float64   │ float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel  │    -99.9 │  0.112188 │     99.9 │\n│ ’Aïn el Hammam │    -99.9 │ -0.225289 │     99.9 │\n│ ’Aïn Roua      │    -99.9 │ -0.198241 │     99.9 │\n│ ‘Ibrī          │    -99.9 │  0.009499 │     99.9 │\n│ ‘Ayn al ‘Arab  │    -99.9 │  0.124730 │     99.9 │\n│ ‘Akko          │    -99.9 │ -0.087184 │     99.9 │\n│ ‘Afrīn         │    -99.9 │ -0.013322 │     99.9 │\n│ Ấp Tân Ngãi    │    -99.9 │  0.344089 │     99.9 │\n│ Ẕefat          │    -99.9 │  0.017767 │     99.9 │\n│ Ḩīsh           │    -99.9 │  0.018804 │     99.9 │\n│ …              │        … │         … │        … │\n└────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n## Polars\n\nFirst let's set the backend to Polars and the `kwargs` dictionary:\n\n::: {#216a28df .cell execution_count=8}\n``` {.python .cell-code}\nibis.set_backend(\"polars\") # <1>\nkwargs = polars_kwargs\n```\n:::\n\n\n1. Set Polars as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#b31c3042 .cell execution_count=9}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=9}\n```{=html}\n
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station     ┃ temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string      │ float64     │\n├─────────────┼─────────────┤\n│ Lívingston  │       -21.0 │\n│ Annūr       │       -33.4 │\n│ Beni Douala │        16.5 │\n└─────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#464e44df .cell execution_count=10}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=10}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#5b80e6c0 .cell execution_count=11}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=11}\n```{=html}\n
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station        ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string         │ float64  │ float64   │ float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel  │    -99.9 │  0.112188 │     99.9 │\n│ ’Aïn el Hammam │    -99.9 │ -0.225289 │     99.9 │\n│ ’Aïn Roua      │    -99.9 │ -0.198241 │     99.9 │\n│ ‘Ibrī          │    -99.9 │  0.009499 │     99.9 │\n│ ‘Ayn al ‘Arab  │    -99.9 │  0.124730 │     99.9 │\n│ ‘Akko          │    -99.9 │ -0.087184 │     99.9 │\n│ ‘Afrīn         │    -99.9 │ -0.013322 │     99.9 │\n│ Ấp Tân Ngãi    │    -99.9 │  0.344089 │     99.9 │\n│ Ẕefat          │    -99.9 │  0.017767 │     99.9 │\n│ Ḩīsh           │    -99.9 │  0.018804 │     99.9 │\n│ …              │        … │         … │        … │\n└────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n## DataFusion\n\nFirst let's set the backend to DataFusion and the `kwargs` dictionary:\n\n::: {#9100f5cc .cell execution_count=12}\n``` {.python .cell-code}\nibis.set_backend(\"datafusion\") # <1>\nkwargs = datafusion_kwargs\n```\n:::\n\n\n1. Set DataFusion as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#ea0c19bf .cell execution_count=13}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=13}\n```{=html}\n
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station     ┃ temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string      │ float64     │\n├─────────────┼─────────────┤\n│ Lívingston  │       -21.0 │\n│ Annūr       │       -33.4 │\n│ Beni Douala │        16.5 │\n└─────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#27301dbd .cell execution_count=14}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=14}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#cd2d7356 .cell execution_count=15}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=15}\n```{=html}\n
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station        ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string         │ float64  │ float64   │ float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel  │    -99.9 │  0.112188 │     99.9 │\n│ ’Aïn el Hammam │    -99.9 │ -0.225289 │     99.9 │\n│ ’Aïn Roua      │    -99.9 │ -0.198241 │     99.9 │\n│ ‘Ibrī          │    -99.9 │  0.009499 │     99.9 │\n│ ‘Ayn al ‘Arab  │    -99.9 │  0.124730 │     99.9 │\n│ ‘Akko          │    -99.9 │ -0.087184 │     99.9 │\n│ ‘Afrīn         │    -99.9 │ -0.013322 │     99.9 │\n│ Ấp Tân Ngãi    │    -99.9 │  0.344089 │     99.9 │\n│ Ẕefat          │    -99.9 │  0.017767 │     99.9 │\n│ Ḩīsh           │    -99.9 │  0.018804 │     99.9 │\n│ …              │        … │         … │        … │\n└────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n:::\n\n## Bonus: more billion row data generation\n\nWhile we're here, I'll share the code I've used in the past to generate a\nbillion rows of random data:\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nsql_str = \"\"\nsql_str += \"select\\n\"\nfor c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1))):\n sql_str += f\" random() as {c},\\n\"\nsql_str += f\"from generate_series(1, {ROWS})\"\n\nt = con.sql(sql_str)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nNowadays I'd convert that to an Ibis expression:\n\n:::{.callout-note}\nThis is a slightly different result with a monotonic index column, but I prefer\nit anyway. You could drop that column or adjust the expression.\n:::\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nt = (\n ibis.range(ROWS)\n .unnest()\n .name(\"index\")\n .as_table()\n .mutate(**{c: ibis.random() for c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1)))})\n)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nBut if you do need to construct a programmatic SQL string, it's cool that you\ncan!\n\n## Conclusion\n\nWhile the one billion row challenge isn't a great benchmark, it's a fun way to\ndemonstrate how Ibis provides a single Python dataframe API to take the billion\nrow challenge with DuckDB, Polars, and DataFusion. Feel free to try it out with\nother backends!\n\nHappy coding!\n\n", "supporting": [ "index_files/figure-html" ], "filters": [], "includes": { "include-in-header": [ - "\n\n\n\n" - ], - "include-after-body": [ - "\n" + "\n\n\n" ] } } diff --git a/docs/posts/1brc/index.qmd b/docs/posts/1brc/index.qmd index bf3816417a6e..ae69de4265e5 100644 --- a/docs/posts/1brc/index.qmd +++ b/docs/posts/1brc/index.qmd @@ -33,7 +33,8 @@ to take the billion row challenge with DuckDB, Polars, and DataFusion. ## Setup -We need to generate the data from the challenge. First, clone the repo: +We need to generate the data from the challenge. First, clone the +[repo](https://github.com/gunnarmorling/1brc): ```{.bash} gh repo clone gunnarmorling/1brc @@ -191,6 +192,12 @@ ibis.set_backend("duckdb") # <1> kwargs = duckdb_kwargs ``` +```{python} +# | code-fold: true +# | echo: false +ibis.get_backend().raw_sql("set enable_progress_bar = false") +``` + 1. 
Redundant given DuckDB is the default Next, we'll read in the data and take a look at the table: From e2493a97dfadfb223716309786eecb8fc7f3bb63 Mon Sep 17 00:00:00 2001 From: Cody Date: Thu, 18 Jan 2024 13:39:24 -0500 Subject: [PATCH 09/13] hide output for some reason --- docs/_freeze/posts/1brc/index/execute-results/html.json | 4 ++-- docs/posts/1brc/index.qmd | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/_freeze/posts/1brc/index/execute-results/html.json b/docs/_freeze/posts/1brc/index/execute-results/html.json index 02d2e06d5694..2528b0f06ce1 100644 --- a/docs/_freeze/posts/1brc/index/execute-results/html.json +++ b/docs/_freeze/posts/1brc/index/execute-results/html.json @@ -1,8 +1,8 @@ { - "hash": "111b00d73a1e825518aaeda4e5cbb1aa", + "hash": "19c65dc8f0fa8b88b740595d2b41d781", "result": { "engine": "jupyter", - "markdown": "---\ntitle: \"Using one Python dataframe API to take the billion row challenge with DuckDB, Polars, and DataFusion\"\nauthor: \"Cody\"\ndate: \"2024-01-22\"\ncategories:\n - blog\n - duckdb\n - polars\n - datafusion\n---\n\n## Overview\n\nThis is an implementation of the [The One Billion Row\nChallenge](https://www.morling.dev/blog/one-billion-row-challenge/):\n\n> Let’s kick off 2024 true coder style—​I’m excited to announce the One Billion\n> Row Challenge (1BRC), running from Jan 1 until Jan 31.\n\n> Your mission, should you decide to accept it, is deceptively simple: write a\n> Java program for retrieving temperature measurement values from a text file and\n> calculating the min, mean, and max temperature per weather station. There’s just\n> one caveat: the file has 1,000,000,000 rows!\n\nI haven't written Java since dropping a computer science course my second year\nof college that forced us to do functional programming exclusively in Java.\nHowever, I'll gladly take the challenge in Python using Ibis! In fact, I did\nsomething like this (generating a billion rows with 26 columns of random numbers\nand doing basic aggregations) to test out DuckDB and Polars.\n\nIn this blog, we'll demonstrate how Ibis provides a single Python dataframe API\nto take the billion row challenge with DuckDB, Polars, and DataFusion.\n\n## Setup\n\nWe need to generate the data from the challenge. First, clone the\n[repo](https://github.com/gunnarmorling/1brc):\n\n```{.bash}\ngh repo clone gunnarmorling/1brc\n```\n\nThen change into the Python directory and run the generation script with the\nnumber of rows you want to generate:\n\n```{.bash}\ncd 1brc/src/main/python\npython create_measurements.py 1_000_000_000\n```\n\nThis will generate a file called `measurements.txt` in the `data` directory at\nthe root of the repo. It is 15GB on disk:\n\n```{.bash}\n(venv) cody@voda 1brc % du 1brc/data/*\n 15G 1brc/data/measurements.txt\n808K 1brc/data/weather_stations.csv\n```\n\nAnd consists of one billion rows with two columns separated by a semicolon:\n\n```{.bash}\n(venv) cody@voda 1brc % head 1brc/data/measurements.txt\nKusugal;-67.2\nIpil;-88.6\nSohna;-31.2\nLubuagan;-2.3\nSzentes;29.2\nSylvan Lake;-70.7\nAmbato;-35.2\nBerkine;97.0\nWernau;73.4\nKennewick;-19.9\n```\n\nAlso, you'll need to install Ibis with the three backends we'll use:\n\n```{.bash}\npip install ibis-framework[duckdb,polars,datafusion]\n```\n\n## Understanding Ibis\n\nIbis provides a standard dataframe API decoupled from the execution engine. 
It\ncompiles Ibis expressions to a form of intermediary representation (often SQL)\nthat can be executed by different backends.\n\nThis allows us to write a single Ibis expression to complete the challenge with\nmany different execution engine backends.\n\n:::{.callout-warning}\nWhile Ibis does its best to abstract away the differences between backends, this\ncannot be done in some areas like data input and output. For example, the\n`read_csv` function across various backends (in their SQL and Python forms) have\ndifferent parameters. We'll handle that with different `kwargs` dictionaries for\nthese backends in this post.\n\nIn general, besides creating a connection and data input/output, the Ibis API is\nthe same across backends.\n:::\n\n## Completing the challenge thrice\n\nWe'll use three great options for local backends -- DuckDB, Polars, and\nDataFusion -- to complete the challenge.\n\n### Setup\n\nBefore we get started, we'll make some imports, turn on interactive mode, and\ndefine the `kwargs` dictionary for the backends corresponding to their\n`read_csv` function:\n\n::: {#a0de49bb .cell execution_count=1}\n``` {.python .cell-code}\nimport ibis\nimport polars as pl\nimport pyarrow as pa\n\nibis.options.interactive = True\n\nduckdb_kwargs = {\n \"delim\": \";\",\n \"header\": False,\n \"columns\": {\"station\": \"VARCHAR\", \"temperature\": \"DOUBLE\"},\n}\n\npolars_kwargs = {\n \"separator\": \";\",\n \"has_header\": False,\n \"new_columns\": [\"station\", \"temperature\"],\n \"schema\": {\"station\": pl.Utf8, \"temperature\": pl.Float64},\n}\n\ndatafusion_kwargs = {\n \"delimiter\": \";\",\n \"has_header\": False,\n \"schema\": pa.schema(\n [\n (\n \"station\",\n pa.string(),\n ),\n (\n \"temperature\",\n pa.float64(),\n ),\n ]\n ),\n \"file_extension\": \".txt\",\n}\n```\n:::\n\n\nLet's define a function to run the same code with each backend to complete the challenge:\n\n::: {#0bdcaf94 .cell execution_count=2}\n``` {.python .cell-code}\ndef run_challenge(t):\n res = (\n t.group_by(ibis._.station)\n .agg(\n min_temp=ibis._.temperature.min(),\n mean_temp=ibis._.temperature.mean(),\n max_temp=ibis._.temperature.max(),\n )\n .order_by(ibis._.station.desc())\n )\n return res\n```\n:::\n\n\n### Completing the challenge\n\nLet's complete the challenge with each backend.\n\n:::{.callout-note}\nThe results are the same across backends but look suspicious. It is noted in the\nrepository that the Python generation code is \"unofficial\", so may have some\nproblems. Given this is a contrived example of generated data, I'm not going to\nworry about it.\n\nThe point is that we can easily complete the challenge with the same code across\nmany backends, letting them worry about the details of execution. For this\nreason, I'm also not providing execution times. Try it out yourself!\n:::\n\n::: {.panel-tabset}\n\n## DuckDb\n\nFirst let's set the backend to DuckDB (redundantly since it's the default) and\nthe `kwargs` dictionary:\n\n::: {#db81b2d0 .cell execution_count=3}\n``` {.python .cell-code}\nibis.set_backend(\"duckdb\") # <1>\nkwargs = duckdb_kwargs\n```\n:::\n\n\n::: {#779c8b66 .cell execution_count=4}\n\n::: {.cell-output .cell-output-display execution_count=4}\n```\n\n```\n:::\n:::\n\n\n1. 
Redundant given DuckDB is the default\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#b7f39b43 .cell execution_count=5}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=5}\n```{=html}\n
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station      temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ stringfloat64     │\n├─────────────┼─────────────┤\n│ Lívingston -21.0 │\n│ Annūr      -33.4 │\n│ Beni Douala16.5 │\n└─────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#2b918636 .cell execution_count=6}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=6}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#ee31ab49 .cell execution_count=7}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=7}\n```{=html}\n
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station         min_temp  mean_temp  max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ stringfloat64float64float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel -99.90.11218899.9 │\n│ ’Aïn el Hammam-99.9-0.22528999.9 │\n│ ’Aïn Roua     -99.9-0.19824199.9 │\n│ ‘Ibrī         -99.90.00949999.9 │\n│ ‘Ayn al ‘Arab -99.90.12473099.9 │\n│ ‘Akko         -99.9-0.08718499.9 │\n│ ‘Afrīn        -99.9-0.01332299.9 │\n│ Ấp Tân Ngãi   -99.90.34408999.9 │\n│ Ẕefat         -99.90.01776799.9 │\n│ Ḩīsh          -99.90.01880499.9 │\n│  │\n└────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n## Polars\n\nFirst let's set the backend to Polars and the `kwargs` dictionary:\n\n::: {#216a28df .cell execution_count=8}\n``` {.python .cell-code}\nibis.set_backend(\"polars\") # <1>\nkwargs = polars_kwargs\n```\n:::\n\n\n1. Set Polars as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#b31c3042 .cell execution_count=9}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=9}\n```{=html}\n
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station      temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ stringfloat64     │\n├─────────────┼─────────────┤\n│ Lívingston -21.0 │\n│ Annūr      -33.4 │\n│ Beni Douala16.5 │\n└─────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#464e44df .cell execution_count=10}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=10}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#5b80e6c0 .cell execution_count=11}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=11}\n```{=html}\n
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station         min_temp  mean_temp  max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ stringfloat64float64float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel -99.90.11218899.9 │\n│ ’Aïn el Hammam-99.9-0.22528999.9 │\n│ ’Aïn Roua     -99.9-0.19824199.9 │\n│ ‘Ibrī         -99.90.00949999.9 │\n│ ‘Ayn al ‘Arab -99.90.12473099.9 │\n│ ‘Akko         -99.9-0.08718499.9 │\n│ ‘Afrīn        -99.9-0.01332299.9 │\n│ Ấp Tân Ngãi   -99.90.34408999.9 │\n│ Ẕefat         -99.90.01776799.9 │\n│ Ḩīsh          -99.90.01880499.9 │\n│  │\n└────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n## DataFusion\n\nFirst let's set the backend to DataFusion and the `kwargs` dictionary:\n\n::: {#9100f5cc .cell execution_count=12}\n``` {.python .cell-code}\nibis.set_backend(\"datafusion\") # <1>\nkwargs = datafusion_kwargs\n```\n:::\n\n\n1. Set DataFusion as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#ea0c19bf .cell execution_count=13}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=13}\n```{=html}\n
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station      temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ stringfloat64     │\n├─────────────┼─────────────┤\n│ Lívingston -21.0 │\n│ Annūr      -33.4 │\n│ Beni Douala16.5 │\n└─────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#27301dbd .cell execution_count=14}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=14}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#cd2d7356 .cell execution_count=15}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=15}\n```{=html}\n
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station         min_temp  mean_temp  max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ stringfloat64float64float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel -99.90.11218899.9 │\n│ ’Aïn el Hammam-99.9-0.22528999.9 │\n│ ’Aïn Roua     -99.9-0.19824199.9 │\n│ ‘Ibrī         -99.90.00949999.9 │\n│ ‘Ayn al ‘Arab -99.90.12473099.9 │\n│ ‘Akko         -99.9-0.08718499.9 │\n│ ‘Afrīn        -99.9-0.01332299.9 │\n│ Ấp Tân Ngãi   -99.90.34408999.9 │\n│ Ẕefat         -99.90.01776799.9 │\n│ Ḩīsh          -99.90.01880499.9 │\n│  │\n└────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n:::\n\n## Bonus: more billion row data generation\n\nWhile we're here, I'll share the code I've used in the past to generate a\nbillion rows of random data:\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nsql_str = \"\"\nsql_str += \"select\\n\"\nfor c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1))):\n sql_str += f\" random() as {c},\\n\"\nsql_str += f\"from generate_series(1, {ROWS})\"\n\nt = con.sql(sql_str)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nNowadays I'd convert that to an Ibis expression:\n\n:::{.callout-note}\nThis is a slightly different result with a monotonic index column, but I prefer\nit anyway. You could drop that column or adjust the expression.\n:::\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nt = (\n ibis.range(ROWS)\n .unnest()\n .name(\"index\")\n .as_table()\n .mutate(**{c: ibis.random() for c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1)))})\n)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nBut if you do need to construct a programmatic SQL string, it's cool that you\ncan!\n\n## Conclusion\n\nWhile the one billion row challenge isn't a great benchmark, it's a fun way to\ndemonstrate how Ibis provides a single Python dataframe API to take the billion\nrow challenge with DuckDB, Polars, and DataFusion. Feel free to try it out with\nother backends!\n\nHappy coding!\n\n", + "markdown": "---\ntitle: \"Using one Python dataframe API to take the billion row challenge with DuckDB, Polars, and DataFusion\"\nauthor: \"Cody\"\ndate: \"2024-01-22\"\ncategories:\n - blog\n - duckdb\n - polars\n - datafusion\n---\n\n## Overview\n\nThis is an implementation of the [The One Billion Row\nChallenge](https://www.morling.dev/blog/one-billion-row-challenge/):\n\n> Let’s kick off 2024 true coder style—​I’m excited to announce the One Billion\n> Row Challenge (1BRC), running from Jan 1 until Jan 31.\n\n> Your mission, should you decide to accept it, is deceptively simple: write a\n> Java program for retrieving temperature measurement values from a text file and\n> calculating the min, mean, and max temperature per weather station. There’s just\n> one caveat: the file has 1,000,000,000 rows!\n\nI haven't written Java since dropping a computer science course my second year\nof college that forced us to do functional programming exclusively in Java.\nHowever, I'll gladly take the challenge in Python using Ibis! In fact, I did\nsomething like this (generating a billion rows with 26 columns of random numbers\nand doing basic aggregations) to test out DuckDB and Polars.\n\nIn this blog, we'll demonstrate how Ibis provides a single Python dataframe API\nto take the billion row challenge with DuckDB, Polars, and DataFusion.\n\n## Setup\n\nWe need to generate the data from the challenge. First, clone the\n[repo](https://github.com/gunnarmorling/1brc):\n\n```{.bash}\ngh repo clone gunnarmorling/1brc\n```\n\nThen change into the Python directory and run the generation script with the\nnumber of rows you want to generate:\n\n```{.bash}\ncd 1brc/src/main/python\npython create_measurements.py 1_000_000_000\n```\n\nThis will generate a file called `measurements.txt` in the `data` directory at\nthe root of the repo. 
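If you'd rather iterate on a smaller file first, a sketch like this writes data in the same `station;temperature` format (illustrative only; the station names and temperature range here are assumptions, not the repo's actual generator logic):

```{.python}
# write a small measurements-like file: one "<station>;<temp>" per line
import random

stations = ["Kusugal", "Ipil", "Sohna", "Szentes", "Kennewick"]  # sample names

with open("measurements_small.txt", "w") as f:
    for _ in range(1_000_000):
        station = random.choice(stations)
        temp = random.uniform(-99.9, 99.9)
        f.write(f"{station};{temp:.1f}\n")
```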
It is 15GB on disk:\n\n```{.bash}\n(venv) cody@voda 1brc % du 1brc/data/*\n 15G 1brc/data/measurements.txt\n808K 1brc/data/weather_stations.csv\n```\n\nAnd consists of one billion rows with two columns separated by a semicolon:\n\n```{.bash}\n(venv) cody@voda 1brc % head 1brc/data/measurements.txt\nKusugal;-67.2\nIpil;-88.6\nSohna;-31.2\nLubuagan;-2.3\nSzentes;29.2\nSylvan Lake;-70.7\nAmbato;-35.2\nBerkine;97.0\nWernau;73.4\nKennewick;-19.9\n```\n\nAlso, you'll need to install Ibis with the three backends we'll use:\n\n```{.bash}\npip install ibis-framework[duckdb,polars,datafusion]\n```\n\n## Understanding Ibis\n\nIbis provides a standard dataframe API decoupled from the execution engine. It\ncompiles Ibis expressions to a form of intermediary representation (often SQL)\nthat can be executed by different backends.\n\nThis allows us to write a single Ibis expression to complete the challenge with\nmany different execution engine backends.\n\n:::{.callout-warning}\nWhile Ibis does its best to abstract away the differences between backends, this\ncannot be done in some areas like data input and output. For example, the\n`read_csv` function across various backends (in their SQL and Python forms) have\ndifferent parameters. We'll handle that with different `kwargs` dictionaries for\nthese backends in this post.\n\nIn general, besides creating a connection and data input/output, the Ibis API is\nthe same across backends.\n:::\n\n## Completing the challenge thrice\n\nWe'll use three great options for local backends -- DuckDB, Polars, and\nDataFusion -- to complete the challenge.\n\n### Setup\n\nBefore we get started, we'll make some imports, turn on interactive mode, and\ndefine the `kwargs` dictionary for the backends corresponding to their\n`read_csv` function:\n\n::: {#381119d9 .cell execution_count=1}\n``` {.python .cell-code}\nimport ibis\nimport polars as pl\nimport pyarrow as pa\n\nibis.options.interactive = True\n\nduckdb_kwargs = {\n \"delim\": \";\",\n \"header\": False,\n \"columns\": {\"station\": \"VARCHAR\", \"temperature\": \"DOUBLE\"},\n}\n\npolars_kwargs = {\n \"separator\": \";\",\n \"has_header\": False,\n \"new_columns\": [\"station\", \"temperature\"],\n \"schema\": {\"station\": pl.Utf8, \"temperature\": pl.Float64},\n}\n\ndatafusion_kwargs = {\n \"delimiter\": \";\",\n \"has_header\": False,\n \"schema\": pa.schema(\n [\n (\n \"station\",\n pa.string(),\n ),\n (\n \"temperature\",\n pa.float64(),\n ),\n ]\n ),\n \"file_extension\": \".txt\",\n}\n```\n:::\n\n\nLet's define a function to run the same code with each backend to complete the challenge:\n\n::: {#9a9ac05c .cell execution_count=2}\n``` {.python .cell-code}\ndef run_challenge(t):\n res = (\n t.group_by(ibis._.station)\n .agg(\n min_temp=ibis._.temperature.min(),\n mean_temp=ibis._.temperature.mean(),\n max_temp=ibis._.temperature.max(),\n )\n .order_by(ibis._.station.desc())\n )\n return res\n```\n:::\n\n\n### Completing the challenge\n\nLet's complete the challenge with each backend.\n\n:::{.callout-note}\nThe results are the same across backends but look suspicious. It is noted in the\nrepository that the Python generation code is \"unofficial\", so may have some\nproblems. Given this is a contrived example of generated data, I'm not going to\nworry about it.\n\nThe point is that we can easily complete the challenge with the same code across\nmany backends, letting them worry about the details of execution. For this\nreason, I'm also not providing execution times. 
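If you want rough numbers on your machine anyway, a simple wall-clock wrapper around the materializing call is enough (a sketch; the measured time includes scanning the CSV, since execution is deferred until results are requested):

```{.python}
import time

start = time.perf_counter()
df = run_challenge(t).to_pandas()  # force full execution
elapsed = time.perf_counter() - start
print(f"{len(df):,} stations in {elapsed:.1f}s")
```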
Try it out yourself!\n:::\n\n::: {.panel-tabset}\n\n## DuckDb\n\nFirst let's set the backend to DuckDB (redundantly since it's the default) and\nthe `kwargs` dictionary:\n\n::: {#69d5adce .cell execution_count=3}\n``` {.python .cell-code}\nibis.set_backend(\"duckdb\") # <1>\nkwargs = duckdb_kwargs\n```\n:::\n\n\n\n\n1. Redundant given DuckDB is the default\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#6aaae280 .cell execution_count=5}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=5}\n```{=html}\n
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station      temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ stringfloat64     │\n├─────────────┼─────────────┤\n│ Lívingston -21.0 │\n│ Annūr      -33.4 │\n│ Beni Douala16.5 │\n└─────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#e3c0b8ba .cell execution_count=6}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=6}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#75349847 .cell execution_count=7}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=7}\n```{=html}\n
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station         min_temp  mean_temp  max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ stringfloat64float64float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel -99.90.11218899.9 │\n│ ’Aïn el Hammam-99.9-0.22528999.9 │\n│ ’Aïn Roua     -99.9-0.19824199.9 │\n│ ‘Ibrī         -99.90.00949999.9 │\n│ ‘Ayn al ‘Arab -99.90.12473099.9 │\n│ ‘Akko         -99.9-0.08718499.9 │\n│ ‘Afrīn        -99.9-0.01332299.9 │\n│ Ấp Tân Ngãi   -99.90.34408999.9 │\n│ Ẕefat         -99.90.01776799.9 │\n│ Ḩīsh          -99.90.01880499.9 │\n│  │\n└────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n## Polars\n\nFirst let's set the backend to Polars and the `kwargs` dictionary:\n\n::: {#5b883812 .cell execution_count=8}\n``` {.python .cell-code}\nibis.set_backend(\"polars\") # <1>\nkwargs = polars_kwargs\n```\n:::\n\n\n1. Set Polars as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#50e65739 .cell execution_count=9}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=9}\n```{=html}\n
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station      temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ stringfloat64     │\n├─────────────┼─────────────┤\n│ Lívingston -21.0 │\n│ Annūr      -33.4 │\n│ Beni Douala16.5 │\n└─────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#8c5f9751 .cell execution_count=10}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=10}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#c9f13117 .cell execution_count=11}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=11}\n```{=html}\n
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station         min_temp  mean_temp  max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ stringfloat64float64float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel -99.90.11218899.9 │\n│ ’Aïn el Hammam-99.9-0.22528999.9 │\n│ ’Aïn Roua     -99.9-0.19824199.9 │\n│ ‘Ibrī         -99.90.00949999.9 │\n│ ‘Ayn al ‘Arab -99.90.12473099.9 │\n│ ‘Akko         -99.9-0.08718499.9 │\n│ ‘Afrīn        -99.9-0.01332299.9 │\n│ Ấp Tân Ngãi   -99.90.34408999.9 │\n│ Ẕefat         -99.90.01776799.9 │\n│ Ḩīsh          -99.90.01880499.9 │\n│  │\n└────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n## DataFusion\n\nFirst let's set the backend to DataFusion and the `kwargs` dictionary:\n\n::: {#c6681a2a .cell execution_count=12}\n``` {.python .cell-code}\nibis.set_backend(\"datafusion\") # <1>\nkwargs = datafusion_kwargs\n```\n:::\n\n\n1. Set DataFusion as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#23dc7a3e .cell execution_count=13}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=13}\n```{=html}\n
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station      temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ stringfloat64     │\n├─────────────┼─────────────┤\n│ Lívingston -21.0 │\n│ Annūr      -33.4 │\n│ Beni Douala16.5 │\n└─────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#a67dcf75 .cell execution_count=14}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=14}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#6588be46 .cell execution_count=15}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=15}\n```{=html}\n
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station         min_temp  mean_temp  max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ stringfloat64float64float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel -99.90.11218899.9 │\n│ ’Aïn el Hammam-99.9-0.22528999.9 │\n│ ’Aïn Roua     -99.9-0.19824199.9 │\n│ ‘Ibrī         -99.90.00949999.9 │\n│ ‘Ayn al ‘Arab -99.90.12473099.9 │\n│ ‘Akko         -99.9-0.08718499.9 │\n│ ‘Afrīn        -99.9-0.01332299.9 │\n│ Ấp Tân Ngãi   -99.90.34408999.9 │\n│ Ẕefat         -99.90.01776799.9 │\n│ Ḩīsh          -99.90.01880499.9 │\n│  │\n└────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n:::\n\n## Bonus: more billion row data generation\n\nWhile we're here, I'll share the code I've used in the past to generate a\nbillion rows of random data:\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nsql_str = \"\"\nsql_str += \"select\\n\"\nfor c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1))):\n sql_str += f\" random() as {c},\\n\"\nsql_str += f\"from generate_series(1, {ROWS})\"\n\nt = con.sql(sql_str)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nNowadays I'd convert that to an Ibis expression:\n\n:::{.callout-note}\nThis is a slightly different result with a monotonic index column, but I prefer\nit anyway. You could drop that column or adjust the expression.\n:::\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nt = (\n ibis.range(ROWS)\n .unnest()\n .name(\"index\")\n .as_table()\n .mutate(**{c: ibis.random() for c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1)))})\n)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nBut if you do need to construct a programmatic SQL string, it's cool that you\ncan!\n\n## Conclusion\n\nWhile the one billion row challenge isn't a great benchmark, it's a fun way to\ndemonstrate how Ibis provides a single Python dataframe API to take the billion\nrow challenge with DuckDB, Polars, and DataFusion. Feel free to try it out with\nother backends!\n\nHappy coding!\n\n", "supporting": [ "index_files/figure-html" ], diff --git a/docs/posts/1brc/index.qmd b/docs/posts/1brc/index.qmd index ae69de4265e5..27b2679053e6 100644 --- a/docs/posts/1brc/index.qmd +++ b/docs/posts/1brc/index.qmd @@ -195,7 +195,7 @@ kwargs = duckdb_kwargs ```{python} # | code-fold: true # | echo: false -ibis.get_backend().raw_sql("set enable_progress_bar = false") +_ = ibis.get_backend().raw_sql("set enable_progress_bar = false") ``` 1. Redundant given DuckDB is the default From a1c7d753c3d0fa37b34413a79f17f4e6c7720780 Mon Sep 17 00:00:00 2001 From: Cody Date: Thu, 18 Jan 2024 16:48:48 -0500 Subject: [PATCH 10/13] freeze --- docs/_freeze/posts/1brc/index/execute-results/html.json | 4 ++-- docs/posts/1brc/index.qmd | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/_freeze/posts/1brc/index/execute-results/html.json b/docs/_freeze/posts/1brc/index/execute-results/html.json index 2528b0f06ce1..aca68e7c5718 100644 --- a/docs/_freeze/posts/1brc/index/execute-results/html.json +++ b/docs/_freeze/posts/1brc/index/execute-results/html.json @@ -1,8 +1,8 @@ { - "hash": "19c65dc8f0fa8b88b740595d2b41d781", + "hash": "aadeb5edf1fc2747b18d8a1f77ef29b9", "result": { "engine": "jupyter", - "markdown": "---\ntitle: \"Using one Python dataframe API to take the billion row challenge with DuckDB, Polars, and DataFusion\"\nauthor: \"Cody\"\ndate: \"2024-01-22\"\ncategories:\n - blog\n - duckdb\n - polars\n - datafusion\n---\n\n## Overview\n\nThis is an implementation of the [The One Billion Row\nChallenge](https://www.morling.dev/blog/one-billion-row-challenge/):\n\n> Let’s kick off 2024 true coder style—​I’m excited to announce the One Billion\n> Row Challenge (1BRC), running from Jan 1 until Jan 31.\n\n> Your mission, should you decide to accept it, is deceptively simple: write a\n> Java program for retrieving temperature measurement values from a text file and\n> calculating the min, mean, and max temperature per weather station. 
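As an aside, the official Java entries print one sorted, map-like line of `station=min/mean/max` values, something like `{Abha=-23.0/18.0/59.2, ...}`. If you wanted to mirror that from an Ibis result, a small formatting sketch over the aggregated pandas DataFrame (column names assumed from `run_challenge` below) could be:

```{.python}
# format an aggregated DataFrame as the challenge's expected output line
# assumes columns: station, min_temp, mean_temp, max_temp
def format_output(df):
    parts = (
        f"{r.station}={r.min_temp:.1f}/{r.mean_temp:.1f}/{r.max_temp:.1f}"
        for r in df.sort_values("station").itertuples()
    )
    return "{" + ", ".join(parts) + "}"
```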
There’s just\n> one caveat: the file has 1,000,000,000 rows!\n\nI haven't written Java since dropping a computer science course my second year\nof college that forced us to do functional programming exclusively in Java.\nHowever, I'll gladly take the challenge in Python using Ibis! In fact, I did\nsomething like this (generating a billion rows with 26 columns of random numbers\nand doing basic aggregations) to test out DuckDB and Polars.\n\nIn this blog, we'll demonstrate how Ibis provides a single Python dataframe API\nto take the billion row challenge with DuckDB, Polars, and DataFusion.\n\n## Setup\n\nWe need to generate the data from the challenge. First, clone the\n[repo](https://github.com/gunnarmorling/1brc):\n\n```{.bash}\ngh repo clone gunnarmorling/1brc\n```\n\nThen change into the Python directory and run the generation script with the\nnumber of rows you want to generate:\n\n```{.bash}\ncd 1brc/src/main/python\npython create_measurements.py 1_000_000_000\n```\n\nThis will generate a file called `measurements.txt` in the `data` directory at\nthe root of the repo. It is 15GB on disk:\n\n```{.bash}\n(venv) cody@voda 1brc % du 1brc/data/*\n 15G 1brc/data/measurements.txt\n808K 1brc/data/weather_stations.csv\n```\n\nAnd consists of one billion rows with two columns separated by a semicolon:\n\n```{.bash}\n(venv) cody@voda 1brc % head 1brc/data/measurements.txt\nKusugal;-67.2\nIpil;-88.6\nSohna;-31.2\nLubuagan;-2.3\nSzentes;29.2\nSylvan Lake;-70.7\nAmbato;-35.2\nBerkine;97.0\nWernau;73.4\nKennewick;-19.9\n```\n\nAlso, you'll need to install Ibis with the three backends we'll use:\n\n```{.bash}\npip install ibis-framework[duckdb,polars,datafusion]\n```\n\n## Understanding Ibis\n\nIbis provides a standard dataframe API decoupled from the execution engine. It\ncompiles Ibis expressions to a form of intermediary representation (often SQL)\nthat can be executed by different backends.\n\nThis allows us to write a single Ibis expression to complete the challenge with\nmany different execution engine backends.\n\n:::{.callout-warning}\nWhile Ibis does its best to abstract away the differences between backends, this\ncannot be done in some areas like data input and output. For example, the\n`read_csv` function across various backends (in their SQL and Python forms) have\ndifferent parameters. 
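One way to keep those differences out of the analysis code is a small dispatch helper keyed on the active backend's name, along these lines (a sketch that assumes the `kwargs` dictionaries defined just below):

```{.python}
# pick the right read_csv kwargs for whichever backend is active
def csv_kwargs():
    match ibis.get_backend().name:
        case "duckdb":
            return duckdb_kwargs
        case "polars":
            return polars_kwargs
        case "datafusion":
            return datafusion_kwargs
        case name:
            raise NotImplementedError(f"no kwargs defined for {name}")

# t = ibis.read_csv("1brc/data/measurements.txt", **csv_kwargs())
```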
We'll handle that with different `kwargs` dictionaries for\nthese backends in this post.\n\nIn general, besides creating a connection and data input/output, the Ibis API is\nthe same across backends.\n:::\n\n## Completing the challenge thrice\n\nWe'll use three great options for local backends -- DuckDB, Polars, and\nDataFusion -- to complete the challenge.\n\n### Setup\n\nBefore we get started, we'll make some imports, turn on interactive mode, and\ndefine the `kwargs` dictionary for the backends corresponding to their\n`read_csv` function:\n\n::: {#381119d9 .cell execution_count=1}\n``` {.python .cell-code}\nimport ibis\nimport polars as pl\nimport pyarrow as pa\n\nibis.options.interactive = True\n\nduckdb_kwargs = {\n \"delim\": \";\",\n \"header\": False,\n \"columns\": {\"station\": \"VARCHAR\", \"temperature\": \"DOUBLE\"},\n}\n\npolars_kwargs = {\n \"separator\": \";\",\n \"has_header\": False,\n \"new_columns\": [\"station\", \"temperature\"],\n \"schema\": {\"station\": pl.Utf8, \"temperature\": pl.Float64},\n}\n\ndatafusion_kwargs = {\n \"delimiter\": \";\",\n \"has_header\": False,\n \"schema\": pa.schema(\n [\n (\n \"station\",\n pa.string(),\n ),\n (\n \"temperature\",\n pa.float64(),\n ),\n ]\n ),\n \"file_extension\": \".txt\",\n}\n```\n:::\n\n\nLet's define a function to run the same code with each backend to complete the challenge:\n\n::: {#9a9ac05c .cell execution_count=2}\n``` {.python .cell-code}\ndef run_challenge(t):\n res = (\n t.group_by(ibis._.station)\n .agg(\n min_temp=ibis._.temperature.min(),\n mean_temp=ibis._.temperature.mean(),\n max_temp=ibis._.temperature.max(),\n )\n .order_by(ibis._.station.desc())\n )\n return res\n```\n:::\n\n\n### Completing the challenge\n\nLet's complete the challenge with each backend.\n\n:::{.callout-note}\nThe results are the same across backends but look suspicious. It is noted in the\nrepository that the Python generation code is \"unofficial\", so may have some\nproblems. Given this is a contrived example of generated data, I'm not going to\nworry about it.\n\nThe point is that we can easily complete the challenge with the same code across\nmany backends, letting them worry about the details of execution. For this\nreason, I'm also not providing execution times. Try it out yourself!\n:::\n\n::: {.panel-tabset}\n\n## DuckDb\n\nFirst let's set the backend to DuckDB (redundantly since it's the default) and\nthe `kwargs` dictionary:\n\n::: {#69d5adce .cell execution_count=3}\n``` {.python .cell-code}\nibis.set_backend(\"duckdb\") # <1>\nkwargs = duckdb_kwargs\n```\n:::\n\n\n\n\n1. Redundant given DuckDB is the default\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#6aaae280 .cell execution_count=5}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=5}\n```{=html}\n
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station      temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ stringfloat64     │\n├─────────────┼─────────────┤\n│ Lívingston -21.0 │\n│ Annūr      -33.4 │\n│ Beni Douala16.5 │\n└─────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#e3c0b8ba .cell execution_count=6}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=6}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#75349847 .cell execution_count=7}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=7}\n```{=html}\n
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station         min_temp  mean_temp  max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ stringfloat64float64float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel -99.90.11218899.9 │\n│ ’Aïn el Hammam-99.9-0.22528999.9 │\n│ ’Aïn Roua     -99.9-0.19824199.9 │\n│ ‘Ibrī         -99.90.00949999.9 │\n│ ‘Ayn al ‘Arab -99.90.12473099.9 │\n│ ‘Akko         -99.9-0.08718499.9 │\n│ ‘Afrīn        -99.9-0.01332299.9 │\n│ Ấp Tân Ngãi   -99.90.34408999.9 │\n│ Ẕefat         -99.90.01776799.9 │\n│ Ḩīsh          -99.90.01880499.9 │\n│  │\n└────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n## Polars\n\nFirst let's set the backend to Polars and the `kwargs` dictionary:\n\n::: {#5b883812 .cell execution_count=8}\n``` {.python .cell-code}\nibis.set_backend(\"polars\") # <1>\nkwargs = polars_kwargs\n```\n:::\n\n\n1. Set Polars as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#50e65739 .cell execution_count=9}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=9}\n```{=html}\n
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station      temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ stringfloat64     │\n├─────────────┼─────────────┤\n│ Lívingston -21.0 │\n│ Annūr      -33.4 │\n│ Beni Douala16.5 │\n└─────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#8c5f9751 .cell execution_count=10}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=10}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#c9f13117 .cell execution_count=11}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=11}\n```{=html}\n
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station         min_temp  mean_temp  max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ stringfloat64float64float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel -99.90.11218899.9 │\n│ ’Aïn el Hammam-99.9-0.22528999.9 │\n│ ’Aïn Roua     -99.9-0.19824199.9 │\n│ ‘Ibrī         -99.90.00949999.9 │\n│ ‘Ayn al ‘Arab -99.90.12473099.9 │\n│ ‘Akko         -99.9-0.08718499.9 │\n│ ‘Afrīn        -99.9-0.01332299.9 │\n│ Ấp Tân Ngãi   -99.90.34408999.9 │\n│ Ẕefat         -99.90.01776799.9 │\n│ Ḩīsh          -99.90.01880499.9 │\n│  │\n└────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n## DataFusion\n\nFirst let's set the backend to DataFusion and the `kwargs` dictionary:\n\n::: {#c6681a2a .cell execution_count=12}\n``` {.python .cell-code}\nibis.set_backend(\"datafusion\") # <1>\nkwargs = datafusion_kwargs\n```\n:::\n\n\n1. Set DataFusion as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#23dc7a3e .cell execution_count=13}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=13}\n```{=html}\n
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station      temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ stringfloat64     │\n├─────────────┼─────────────┤\n│ Lívingston -21.0 │\n│ Annūr      -33.4 │\n│ Beni Douala16.5 │\n└─────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#a67dcf75 .cell execution_count=14}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=14}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#6588be46 .cell execution_count=15}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=15}\n```{=html}\n
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station         min_temp  mean_temp  max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ stringfloat64float64float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel -99.90.11218899.9 │\n│ ’Aïn el Hammam-99.9-0.22528999.9 │\n│ ’Aïn Roua     -99.9-0.19824199.9 │\n│ ‘Ibrī         -99.90.00949999.9 │\n│ ‘Ayn al ‘Arab -99.90.12473099.9 │\n│ ‘Akko         -99.9-0.08718499.9 │\n│ ‘Afrīn        -99.9-0.01332299.9 │\n│ Ấp Tân Ngãi   -99.90.34408999.9 │\n│ Ẕefat         -99.90.01776799.9 │\n│ Ḩīsh          -99.90.01880499.9 │\n│  │\n└────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n:::\n\n## Bonus: more billion row data generation\n\nWhile we're here, I'll share the code I've used in the past to generate a\nbillion rows of random data:\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nsql_str = \"\"\nsql_str += \"select\\n\"\nfor c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1))):\n sql_str += f\" random() as {c},\\n\"\nsql_str += f\"from generate_series(1, {ROWS})\"\n\nt = con.sql(sql_str)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nNowadays I'd convert that to an Ibis expression:\n\n:::{.callout-note}\nThis is a slightly different result with a monotonic index column, but I prefer\nit anyway. You could drop that column or adjust the expression.\n:::\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nt = (\n ibis.range(ROWS)\n .unnest()\n .name(\"index\")\n .as_table()\n .mutate(**{c: ibis.random() for c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1)))})\n)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nBut if you do need to construct a programmatic SQL string, it's cool that you\ncan!\n\n## Conclusion\n\nWhile the one billion row challenge isn't a great benchmark, it's a fun way to\ndemonstrate how Ibis provides a single Python dataframe API to take the billion\nrow challenge with DuckDB, Polars, and DataFusion. Feel free to try it out with\nother backends!\n\nHappy coding!\n\n", + "markdown": "---\ntitle: \"Using one Python dataframe API to take the billion row challenge with DuckDB, Polars, and DataFusion\"\nauthor: \"Cody\"\ndate: \"2024-01-22\"\ncategories:\n - blog\n - duckdb\n - polars\n - datafusion\n---\n\n## Overview\n\nThis is an implementation of the [The One Billion Row\nChallenge](https://www.morling.dev/blog/one-billion-row-challenge/):\n\n> Let’s kick off 2024 true coder style—​I’m excited to announce the One Billion\n> Row Challenge (1BRC), running from Jan 1 until Jan 31.\n\n> Your mission, should you decide to accept it, is deceptively simple: write a\n> Java program for retrieving temperature measurement values from a text file and\n> calculating the min, mean, and max temperature per weather station. There’s just\n> one caveat: the file has 1,000,000,000 rows!\n\nI haven't written Java since dropping a computer science course my second year\nof college that forced us to do functional programming exclusively in Java.\nHowever, I'll gladly take the challenge in Python using Ibis! In fact, I did\nsomething like this (generating a billion rows with 26 columns of random numbers\nand doing basic aggregations) to test out DuckDB and Polars.\n\nIn this blog, we'll demonstrate how Ibis provides a single Python dataframe API\nto take the billion row challenge with DuckDB, Polars, and DataFusion.\n\n## Setup\n\nWe need to generate the data from the challenge. First, clone the\n[repo](https://github.com/gunnarmorling/1brc):\n\n```{.bash}\ngh repo clone gunnarmorling/1brc\n```\n\nThen change into the Python directory and run the generation script with the\nnumber of rows you want to generate:\n\n```{.bash}\ncd 1brc/src/main/python\npython create_measurements.py 1_000_000_000\n```\n\nThis will generate a file called `measurements.txt` in the `data` directory at\nthe root of the repo. 
It is 15GB on disk:\n\n```{.bash}\n(venv) cody@voda 1brc % du 1brc/data/*\n 15G 1brc/data/measurements.txt\n808K 1brc/data/weather_stations.csv\n```\n\nAnd consists of one billion rows with two columns separated by a semicolon:\n\n```{.bash}\n(venv) cody@voda 1brc % head 1brc/data/measurements.txt\nKusugal;-67.2\nIpil;-88.6\nSohna;-31.2\nLubuagan;-2.3\nSzentes;29.2\nSylvan Lake;-70.7\nAmbato;-35.2\nBerkine;97.0\nWernau;73.4\nKennewick;-19.9\n```\n\nAlso, you'll need to install Ibis with the three backends we'll use:\n\n```{.bash}\npip install ibis-framework[duckdb,polars,datafusion]\n```\n\n## Understanding Ibis\n\nIbis provides a standard dataframe API decoupled from the execution engine. It\ncompiles Ibis expressions to a form of intermediary representation (often SQL)\nthat can be executed by different backends.\n\nThis allows us to write a single Ibis expression to complete the challenge with\nmany different execution engine backends.\n\n:::{.callout-warning}\nWhile Ibis does its best to abstract away the differences between backends, this\ncannot be done in some areas like data input and output. For example, the\n`read_csv` function across various backends (in their SQL and Python forms) have\ndifferent parameters. We'll handle that with different `kwargs` dictionaries for\nthese backends in this post.\n\nIn general, besides creating a connection and data input/output, the Ibis API is\nthe same across backends.\n:::\n\n## Completing the challenge thrice\n\nWe'll use three great options for local backends -- DuckDB, Polars, and\nDataFusion -- to complete the challenge.\n\n### Setup\n\nBefore we get started, we'll make some imports, turn on interactive mode, and\ndefine the `kwargs` dictionary for the backends corresponding to their\n`read_csv` function:\n\n::: {#dff7ea8d .cell execution_count=1}\n``` {.python .cell-code}\nimport ibis\nimport polars as pl\nimport pyarrow as pa\n\nibis.options.interactive = True\n\nduckdb_kwargs = {\n \"delim\": \";\",\n \"header\": False,\n \"columns\": {\"station\": \"VARCHAR\", \"temperature\": \"DOUBLE\"},\n}\n\npolars_kwargs = {\n \"separator\": \";\",\n \"has_header\": False,\n \"new_columns\": [\"station\", \"temperature\"],\n \"schema\": {\"station\": pl.Utf8, \"temperature\": pl.Float64},\n}\n\ndatafusion_kwargs = {\n \"delimiter\": \";\",\n \"has_header\": False,\n \"schema\": pa.schema(\n [\n (\n \"station\",\n pa.string(),\n ),\n (\n \"temperature\",\n pa.float64(),\n ),\n ]\n ),\n \"file_extension\": \".txt\",\n}\n```\n:::\n\n\nLet's define a function to run the same code with each backend to complete the challenge:\n\n::: {#8257f94c .cell execution_count=2}\n``` {.python .cell-code}\ndef run_challenge(t):\n res = (\n t.group_by(ibis._.station)\n .agg(\n min_temp=ibis._.temperature.min(),\n mean_temp=ibis._.temperature.mean(),\n max_temp=ibis._.temperature.max(),\n )\n .order_by(ibis._.station.desc())\n )\n return res\n```\n:::\n\n\n### Completing the challenge\n\nLet's complete the challenge with each backend.\n\n:::{.callout-note}\nThe results are the same across backends but look suspicious. It is noted in the\nrepository that the Python generation code is \"unofficial\", so may have some\nproblems. Given this is a contrived example of generated data, I'm not going to\nworry about it.\n\nThe point is that we can easily complete the challenge with the same code across\nmany backends, letting them worry about the details of execution. For this\nreason, I'm also not providing execution times. 
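If you'd like to compare all three backends on your machine, a loop over them with the same expression works (a sketch; results will vary with hardware and filesystem cache, so treat any numbers as rough):

```{.python}
import time

for backend, kw in [
    ("duckdb", duckdb_kwargs),
    ("polars", polars_kwargs),
    ("datafusion", datafusion_kwargs),
]:
    ibis.set_backend(backend)
    t = ibis.read_csv("1brc/data/measurements.txt", **kw)
    start = time.perf_counter()
    run_challenge(t).to_pandas()  # force full execution
    print(f"{backend}: {time.perf_counter() - start:.1f}s")
```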
Try it out yourself!\n:::\n\n::: {.panel-tabset}\n\n## DuckDB\n\nFirst let's set the backend to DuckDB (redundantly since it's the default) and\nthe `kwargs` dictionary:\n\n::: {#cea6f439 .cell execution_count=3}\n``` {.python .cell-code}\nibis.set_backend(\"duckdb\") # <1>\nkwargs = duckdb_kwargs\n```\n:::\n\n\n\n\n1. Redundant given DuckDB is the default\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#193a3ce5 .cell execution_count=5}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=5}\n```{=html}\n
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station      temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ stringfloat64     │\n├─────────────┼─────────────┤\n│ Lívingston -21.0 │\n│ Annūr      -33.4 │\n│ Beni Douala16.5 │\n└─────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#c96829d6 .cell execution_count=6}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=6}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#c951569b .cell execution_count=7}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=7}\n```{=html}\n
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station         min_temp  mean_temp  max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ stringfloat64float64float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel -99.90.11218899.9 │\n│ ’Aïn el Hammam-99.9-0.22528999.9 │\n│ ’Aïn Roua     -99.9-0.19824199.9 │\n│ ‘Ibrī         -99.90.00949999.9 │\n│ ‘Ayn al ‘Arab -99.90.12473099.9 │\n│ ‘Akko         -99.9-0.08718499.9 │\n│ ‘Afrīn        -99.9-0.01332299.9 │\n│ Ấp Tân Ngãi   -99.90.34408999.9 │\n│ Ẕefat         -99.90.01776799.9 │\n│ Ḩīsh          -99.90.01880499.9 │\n│  │\n└────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n## Polars\n\nFirst let's set the backend to Polars and the `kwargs` dictionary:\n\n::: {#ee718894 .cell execution_count=8}\n``` {.python .cell-code}\nibis.set_backend(\"polars\") # <1>\nkwargs = polars_kwargs\n```\n:::\n\n\n1. Set Polars as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#c4ee239c .cell execution_count=9}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=9}\n```{=html}\n
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station      temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ stringfloat64     │\n├─────────────┼─────────────┤\n│ Lívingston -21.0 │\n│ Annūr      -33.4 │\n│ Beni Douala16.5 │\n└─────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#0dc00b81 .cell execution_count=10}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=10}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#8df0469a .cell execution_count=11}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=11}\n```{=html}\n
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station         min_temp  mean_temp  max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ stringfloat64float64float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel -99.90.11218899.9 │\n│ ’Aïn el Hammam-99.9-0.22528999.9 │\n│ ’Aïn Roua     -99.9-0.19824199.9 │\n│ ‘Ibrī         -99.90.00949999.9 │\n│ ‘Ayn al ‘Arab -99.90.12473099.9 │\n│ ‘Akko         -99.9-0.08718499.9 │\n│ ‘Afrīn        -99.9-0.01332299.9 │\n│ Ấp Tân Ngãi   -99.90.34408999.9 │\n│ Ẕefat         -99.90.01776799.9 │\n│ Ḩīsh          -99.90.01880499.9 │\n│  │\n└────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n## DataFusion\n\nFirst let's set the backend to DataFusion and the `kwargs` dictionary:\n\n::: {#d14fb112 .cell execution_count=12}\n``` {.python .cell-code}\nibis.set_backend(\"datafusion\") # <1>\nkwargs = datafusion_kwargs\n```\n:::\n\n\n1. Set DataFusion as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#922b3591 .cell execution_count=13}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=13}\n```{=html}\n
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station      temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ stringfloat64     │\n├─────────────┼─────────────┤\n│ Lívingston -21.0 │\n│ Annūr      -33.4 │\n│ Beni Douala16.5 │\n└─────────────┴─────────────┘\n
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#2b50efa1 .cell execution_count=14}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=14}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#d43becb7 .cell execution_count=15}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=15}\n```{=html}\n
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station         min_temp  mean_temp  max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ stringfloat64float64float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel -99.90.11218899.9 │\n│ ’Aïn el Hammam-99.9-0.22528999.9 │\n│ ’Aïn Roua     -99.9-0.19824199.9 │\n│ ‘Ibrī         -99.90.00949999.9 │\n│ ‘Ayn al ‘Arab -99.90.12473099.9 │\n│ ‘Akko         -99.9-0.08718499.9 │\n│ ‘Afrīn        -99.9-0.01332299.9 │\n│ Ấp Tân Ngãi   -99.90.34408999.9 │\n│ Ẕefat         -99.90.01776799.9 │\n│ Ḩīsh          -99.90.01880499.9 │\n│  │\n└────────────────┴──────────┴───────────┴──────────┘\n
\n```\n:::\n:::\n\n\n:::\n\n## Bonus: more billion row data generation\n\nWhile we're here, I'll share the code I've used in the past to generate a\nbillion rows of random data:\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nsql_str = \"\"\nsql_str += \"select\\n\"\nfor c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1))):\n sql_str += f\" random() as {c},\\n\"\nsql_str += f\"from generate_series(1, {ROWS})\"\n\nt = con.sql(sql_str)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nNowadays I'd convert that to an Ibis expression:\n\n:::{.callout-note}\nThis is a slightly different result with a monotonic index column, but I prefer\nit anyway. You could drop that column or adjust the expression.\n:::\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nt = (\n ibis.range(ROWS)\n .unnest()\n .name(\"index\")\n .as_table()\n .mutate(**{c: ibis.random() for c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1)))})\n)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nBut if you do need to construct a programmatic SQL string, it's cool that you\ncan!\n\n## Conclusion\n\nWhile the one billion row challenge isn't a great benchmark, it's a fun way to\ndemonstrate how Ibis provides a single Python dataframe API to take the billion\nrow challenge with DuckDB, Polars, and DataFusion. Feel free to try it out with\nother backends!\n\nHappy coding!\n\n", "supporting": [ "index_files/figure-html" ], diff --git a/docs/posts/1brc/index.qmd b/docs/posts/1brc/index.qmd index 27b2679053e6..ada9a116f9b8 100644 --- a/docs/posts/1brc/index.qmd +++ b/docs/posts/1brc/index.qmd @@ -182,7 +182,7 @@ reason, I'm also not providing execution times. Try it out yourself! ::: {.panel-tabset} -## DuckDb +## DuckDB First let's set the backend to DuckDB (redundantly since it's the default) and the `kwargs` dictionary: From ec634bf05dcdc62b7a9e288b6f0479eeec476b58 Mon Sep 17 00:00:00 2001 From: Cody Date: Thu, 18 Jan 2024 20:29:44 -0500 Subject: [PATCH 11/13] add portability tag --- docs/_freeze/posts/1brc/index/execute-results/html.json | 4 ++-- docs/posts/1brc/index.qmd | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/_freeze/posts/1brc/index/execute-results/html.json b/docs/_freeze/posts/1brc/index/execute-results/html.json index aca68e7c5718..fa605313d516 100644 --- a/docs/_freeze/posts/1brc/index/execute-results/html.json +++ b/docs/_freeze/posts/1brc/index/execute-results/html.json @@ -1,8 +1,8 @@ { - "hash": "aadeb5edf1fc2747b18d8a1f77ef29b9", + "hash": "708d488ca91f22708b5c5e94fd03e5da", "result": { "engine": "jupyter", - "markdown": "---\ntitle: \"Using one Python dataframe API to take the billion row challenge with DuckDB, Polars, and DataFusion\"\nauthor: \"Cody\"\ndate: \"2024-01-22\"\ncategories:\n - blog\n - duckdb\n - polars\n - datafusion\n---\n\n## Overview\n\nThis is an implementation of the [The One Billion Row\nChallenge](https://www.morling.dev/blog/one-billion-row-challenge/):\n\n> Let’s kick off 2024 true coder style—​I’m excited to announce the One Billion\n> Row Challenge (1BRC), running from Jan 1 until Jan 31.\n\n> Your mission, should you decide to accept it, is deceptively simple: write a\n> Java program for retrieving temperature measurement values from a text file and\n> calculating the min, mean, and max temperature per weather station. 
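For comparison, the same aggregation is also a short SQL query; with DuckDB you could run it directly through Ibis (a sketch using `con.sql`, in the spirit of the programmatic SQL shown in the bonus section; the `read_csv` options are DuckDB's SQL-level equivalents of the Python kwargs):

```{.python}
con = ibis.connect("duckdb://")

sql = """
select station,
       min(temperature) as min_temp,
       avg(temperature) as mean_temp,
       max(temperature) as max_temp
from read_csv('1brc/data/measurements.txt',
              delim=';', header=false,
              columns={'station': 'VARCHAR', 'temperature': 'DOUBLE'})
group by station
order by station desc
"""
res = con.sql(sql)
```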
There’s just\n> one caveat: the file has 1,000,000,000 rows!\n\nI haven't written Java since dropping a computer science course my second year\nof college that forced us to do functional programming exclusively in Java.\nHowever, I'll gladly take the challenge in Python using Ibis! In fact, I did\nsomething like this (generating a billion rows with 26 columns of random numbers\nand doing basic aggregations) to test out DuckDB and Polars.\n\nIn this blog, we'll demonstrate how Ibis provides a single Python dataframe API\nto take the billion row challenge with DuckDB, Polars, and DataFusion.\n\n## Setup\n\nWe need to generate the data from the challenge. First, clone the\n[repo](https://github.com/gunnarmorling/1brc):\n\n```{.bash}\ngh repo clone gunnarmorling/1brc\n```\n\nThen change into the Python directory and run the generation script with the\nnumber of rows you want to generate:\n\n```{.bash}\ncd 1brc/src/main/python\npython create_measurements.py 1_000_000_000\n```\n\nThis will generate a file called `measurements.txt` in the `data` directory at\nthe root of the repo. It is 15GB on disk:\n\n```{.bash}\n(venv) cody@voda 1brc % du 1brc/data/*\n 15G 1brc/data/measurements.txt\n808K 1brc/data/weather_stations.csv\n```\n\nAnd consists of one billion rows with two columns separated by a semicolon:\n\n```{.bash}\n(venv) cody@voda 1brc % head 1brc/data/measurements.txt\nKusugal;-67.2\nIpil;-88.6\nSohna;-31.2\nLubuagan;-2.3\nSzentes;29.2\nSylvan Lake;-70.7\nAmbato;-35.2\nBerkine;97.0\nWernau;73.4\nKennewick;-19.9\n```\n\nAlso, you'll need to install Ibis with the three backends we'll use:\n\n```{.bash}\npip install ibis-framework[duckdb,polars,datafusion]\n```\n\n## Understanding Ibis\n\nIbis provides a standard dataframe API decoupled from the execution engine. It\ncompiles Ibis expressions to a form of intermediary representation (often SQL)\nthat can be executed by different backends.\n\nThis allows us to write a single Ibis expression to complete the challenge with\nmany different execution engine backends.\n\n:::{.callout-warning}\nWhile Ibis does its best to abstract away the differences between backends, this\ncannot be done in some areas like data input and output. For example, the\n`read_csv` function across various backends (in their SQL and Python forms) have\ndifferent parameters. 
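Whichever backend does the reading, it's worth a quick check that the two columns came through with the intended types; `Table.schema()` is part of the backend-agnostic Ibis API (the exact printed format may differ slightly):

```{.python}
t = ibis.read_csv("1brc/data/measurements.txt", **kwargs)
print(t.schema())
# every backend should report station as string and temperature as float64
```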
We'll handle that with different `kwargs` dictionaries for\nthese backends in this post.\n\nIn general, besides creating a connection and data input/output, the Ibis API is\nthe same across backends.\n:::\n\n## Completing the challenge thrice\n\nWe'll use three great options for local backends -- DuckDB, Polars, and\nDataFusion -- to complete the challenge.\n\n### Setup\n\nBefore we get started, we'll make some imports, turn on interactive mode, and\ndefine the `kwargs` dictionary for the backends corresponding to their\n`read_csv` function:\n\n::: {#dff7ea8d .cell execution_count=1}\n``` {.python .cell-code}\nimport ibis\nimport polars as pl\nimport pyarrow as pa\n\nibis.options.interactive = True\n\nduckdb_kwargs = {\n \"delim\": \";\",\n \"header\": False,\n \"columns\": {\"station\": \"VARCHAR\", \"temperature\": \"DOUBLE\"},\n}\n\npolars_kwargs = {\n \"separator\": \";\",\n \"has_header\": False,\n \"new_columns\": [\"station\", \"temperature\"],\n \"schema\": {\"station\": pl.Utf8, \"temperature\": pl.Float64},\n}\n\ndatafusion_kwargs = {\n \"delimiter\": \";\",\n \"has_header\": False,\n \"schema\": pa.schema(\n [\n (\n \"station\",\n pa.string(),\n ),\n (\n \"temperature\",\n pa.float64(),\n ),\n ]\n ),\n \"file_extension\": \".txt\",\n}\n```\n:::\n\n\nLet's define a function to run the same code with each backend to complete the challenge:\n\n::: {#8257f94c .cell execution_count=2}\n``` {.python .cell-code}\ndef run_challenge(t):\n res = (\n t.group_by(ibis._.station)\n .agg(\n min_temp=ibis._.temperature.min(),\n mean_temp=ibis._.temperature.mean(),\n max_temp=ibis._.temperature.max(),\n )\n .order_by(ibis._.station.desc())\n )\n return res\n```\n:::\n\n\n### Completing the challenge\n\nLet's complete the challenge with each backend.\n\n:::{.callout-note}\nThe results are the same across backends but look suspicious. It is noted in the\nrepository that the Python generation code is \"unofficial\", so may have some\nproblems. Given this is a contrived example of generated data, I'm not going to\nworry about it.\n\nThe point is that we can easily complete the challenge with the same code across\nmany backends, letting them worry about the details of execution. For this\nreason, I'm also not providing execution times. Try it out yourself!\n:::\n\n::: {.panel-tabset}\n\n## DuckDB\n\nFirst let's set the backend to DuckDB (redundantly since it's the default) and\nthe `kwargs` dictionary:\n\n::: {#cea6f439 .cell execution_count=3}\n``` {.python .cell-code}\nibis.set_backend(\"duckdb\") # <1>\nkwargs = duckdb_kwargs\n```\n:::\n\n\n\n\n1. Redundant given DuckDB is the default\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#193a3ce5 .cell execution_count=5}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=5}\n```{=html}\n
<pre>┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station     ┃ temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string      │ float64     │\n├─────────────┼─────────────┤\n│ Lívingston  │       -21.0 │\n│ Annūr       │       -33.4 │\n│ Beni Douala │        16.5 │\n└─────────────┴─────────────┘\n</pre>
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#c96829d6 .cell execution_count=6}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=6}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#c951569b .cell execution_count=7}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=7}\n```{=html}\n
<pre>┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station        ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string         │ float64  │ float64   │ float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel  │    -99.9 │  0.112188 │     99.9 │\n│ ’Aïn el Hammam │    -99.9 │ -0.225289 │     99.9 │\n│ ’Aïn Roua      │    -99.9 │ -0.198241 │     99.9 │\n│ ‘Ibrī          │    -99.9 │  0.009499 │     99.9 │\n│ ‘Ayn al ‘Arab  │    -99.9 │  0.124730 │     99.9 │\n│ ‘Akko          │    -99.9 │ -0.087184 │     99.9 │\n│ ‘Afrīn         │    -99.9 │ -0.013322 │     99.9 │\n│ Ấp Tân Ngãi    │    -99.9 │  0.344089 │     99.9 │\n│ Ẕefat          │    -99.9 │  0.017767 │     99.9 │\n│ Ḩīsh           │    -99.9 │  0.018804 │     99.9 │\n│ …              │        … │         … │        … │\n└────────────────┴──────────┴───────────┴──────────┘\n</pre>
\n```\n:::\n:::\n\n\n## Polars\n\nFirst let's set the backend to Polars and the `kwargs` dictionary:\n\n::: {#ee718894 .cell execution_count=8}\n``` {.python .cell-code}\nibis.set_backend(\"polars\") # <1>\nkwargs = polars_kwargs\n```\n:::\n\n\n1. Set Polars as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#c4ee239c .cell execution_count=9}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=9}\n```{=html}\n
<pre>┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station     ┃ temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string      │ float64     │\n├─────────────┼─────────────┤\n│ Lívingston  │       -21.0 │\n│ Annūr       │       -33.4 │\n│ Beni Douala │        16.5 │\n└─────────────┴─────────────┘\n</pre>
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#0dc00b81 .cell execution_count=10}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=10}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#8df0469a .cell execution_count=11}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=11}\n```{=html}\n
<pre>┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station        ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string         │ float64  │ float64   │ float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel  │    -99.9 │  0.112188 │     99.9 │\n│ ’Aïn el Hammam │    -99.9 │ -0.225289 │     99.9 │\n│ ’Aïn Roua      │    -99.9 │ -0.198241 │     99.9 │\n│ ‘Ibrī          │    -99.9 │  0.009499 │     99.9 │\n│ ‘Ayn al ‘Arab  │    -99.9 │  0.124730 │     99.9 │\n│ ‘Akko          │    -99.9 │ -0.087184 │     99.9 │\n│ ‘Afrīn         │    -99.9 │ -0.013322 │     99.9 │\n│ Ấp Tân Ngãi    │    -99.9 │  0.344089 │     99.9 │\n│ Ẕefat          │    -99.9 │  0.017767 │     99.9 │\n│ Ḩīsh           │    -99.9 │  0.018804 │     99.9 │\n│ …              │        … │         … │        … │\n└────────────────┴──────────┴───────────┴──────────┘\n</pre>
\n```\n:::\n:::\n\n\n## DataFusion\n\nFirst let's set the backend to DataFusion and the `kwargs` dictionary:\n\n::: {#d14fb112 .cell execution_count=12}\n``` {.python .cell-code}\nibis.set_backend(\"datafusion\") # <1>\nkwargs = datafusion_kwargs\n```\n:::\n\n\n1. Set DataFusion as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#922b3591 .cell execution_count=13}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=13}\n```{=html}\n
<pre>┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station     ┃ temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string      │ float64     │\n├─────────────┼─────────────┤\n│ Lívingston  │       -21.0 │\n│ Annūr       │       -33.4 │\n│ Beni Douala │        16.5 │\n└─────────────┴─────────────┘\n</pre>
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#2b50efa1 .cell execution_count=14}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=14}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#d43becb7 .cell execution_count=15}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=15}\n```{=html}\n
<pre>┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station        ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string         │ float64  │ float64   │ float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel  │    -99.9 │  0.112188 │     99.9 │\n│ ’Aïn el Hammam │    -99.9 │ -0.225289 │     99.9 │\n│ ’Aïn Roua      │    -99.9 │ -0.198241 │     99.9 │\n│ ‘Ibrī          │    -99.9 │  0.009499 │     99.9 │\n│ ‘Ayn al ‘Arab  │    -99.9 │  0.124730 │     99.9 │\n│ ‘Akko          │    -99.9 │ -0.087184 │     99.9 │\n│ ‘Afrīn         │    -99.9 │ -0.013322 │     99.9 │\n│ Ấp Tân Ngãi    │    -99.9 │  0.344089 │     99.9 │\n│ Ẕefat          │    -99.9 │  0.017767 │     99.9 │\n│ Ḩīsh           │    -99.9 │  0.018804 │     99.9 │\n│ …              │        … │         … │        … │\n└────────────────┴──────────┴───────────┴──────────┘\n</pre>
\n```\n:::\n:::\n\n\n:::\n\n## Bonus: more billion row data generation\n\nWhile we're here, I'll share the code I've used in the past to generate a\nbillion rows of random data:\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nsql_str = \"\"\nsql_str += \"select\\n\"\nfor c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1))):\n sql_str += f\" random() as {c},\\n\"\nsql_str += f\"from generate_series(1, {ROWS})\"\n\nt = con.sql(sql_str)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nNowadays I'd convert that to an Ibis expression:\n\n:::{.callout-note}\nThis is a slightly different result with a monotonic index column, but I prefer\nit anyway. You could drop that column or adjust the expression.\n:::\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nt = (\n ibis.range(ROWS)\n .unnest()\n .name(\"index\")\n .as_table()\n .mutate(**{c: ibis.random() for c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1)))})\n)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nBut if you do need to construct a programmatic SQL string, it's cool that you\ncan!\n\n## Conclusion\n\nWhile the one billion row challenge isn't a great benchmark, it's a fun way to\ndemonstrate how Ibis provides a single Python dataframe API to take the billion\nrow challenge with DuckDB, Polars, and DataFusion. Feel free to try it out with\nother backends!\n\nHappy coding!\n\n", + "markdown": "---\ntitle: \"Using one Python dataframe API to take the billion row challenge with DuckDB, Polars, and DataFusion\"\nauthor: \"Cody\"\ndate: \"2024-01-22\"\ncategories:\n - blog\n - duckdb\n - polars\n - datafusion\n - portability\n---\n\n## Overview\n\nThis is an implementation of the [The One Billion Row\nChallenge](https://www.morling.dev/blog/one-billion-row-challenge/):\n\n> Let’s kick off 2024 true coder style—​I’m excited to announce the One Billion\n> Row Challenge (1BRC), running from Jan 1 until Jan 31.\n\n> Your mission, should you decide to accept it, is deceptively simple: write a\n> Java program for retrieving temperature measurement values from a text file and\n> calculating the min, mean, and max temperature per weather station. There’s just\n> one caveat: the file has 1,000,000,000 rows!\n\nI haven't written Java since dropping a computer science course my second year\nof college that forced us to do functional programming exclusively in Java.\nHowever, I'll gladly take the challenge in Python using Ibis! In fact, I did\nsomething like this (generating a billion rows with 26 columns of random numbers\nand doing basic aggregations) to test out DuckDB and Polars.\n\nIn this blog, we'll demonstrate how Ibis provides a single Python dataframe API\nto take the billion row challenge with DuckDB, Polars, and DataFusion.\n\n## Setup\n\nWe need to generate the data from the challenge. First, clone the\n[repo](https://github.com/gunnarmorling/1brc):\n\n```{.bash}\ngh repo clone gunnarmorling/1brc\n```\n\nThen change into the Python directory and run the generation script with the\nnumber of rows you want to generate:\n\n```{.bash}\ncd 1brc/src/main/python\npython create_measurements.py 1_000_000_000\n```\n\nThis will generate a file called `measurements.txt` in the `data` directory at\nthe root of the repo. 
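\n\nGenerating a billion rows takes a while. Once the script finishes, a quick\nsanity check from Python confirms the file looks right before we point Ibis at\nit -- a minimal sketch using only the standard library:\n\n```{.python}\nfrom itertools import islice\n\n# peek at the first rows of the generated file without reading all of it\nwith open(\"1brc/data/measurements.txt\") as f:\n    for row in islice(f, 3):\n        station, temperature = row.strip().split(\";\")\n        print(station, float(temperature))\n```\n\n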
It is 15GB on disk:\n\n```{.bash}\n(venv) cody@voda 1brc % du 1brc/data/*\n 15G 1brc/data/measurements.txt\n808K 1brc/data/weather_stations.csv\n```\n\nAnd consists of one billion rows with two columns separated by a semicolon:\n\n```{.bash}\n(venv) cody@voda 1brc % head 1brc/data/measurements.txt\nKusugal;-67.2\nIpil;-88.6\nSohna;-31.2\nLubuagan;-2.3\nSzentes;29.2\nSylvan Lake;-70.7\nAmbato;-35.2\nBerkine;97.0\nWernau;73.4\nKennewick;-19.9\n```\n\nAlso, you'll need to install Ibis with the three backends we'll use:\n\n```{.bash}\npip install 'ibis-framework[duckdb,polars,datafusion]'\n```\n\n## Understanding Ibis\n\nIbis provides a standard dataframe API decoupled from the execution engine. It\ncompiles Ibis expressions to an intermediate representation (often SQL) that\ncan be executed by different backends.\n\nThis allows us to write a single Ibis expression to complete the challenge with\nmany different execution engine backends.\n\n:::{.callout-warning}\nWhile Ibis does its best to abstract away the differences between backends, this\ncannot be done in some areas like data input and output. For example, the\n`read_csv` functions across various backends (in their SQL and Python forms) have\ndifferent parameters. We'll handle that with different `kwargs` dictionaries for\nthese backends in this post.\n\nIn general, besides creating a connection and data input/output, the Ibis API is\nthe same across backends.\n:::\n\n## Completing the challenge thrice\n\nWe'll use three great options for local backends -- DuckDB, Polars, and\nDataFusion -- to complete the challenge.\n\n### Setup\n\nBefore we get started, we'll make some imports, turn on interactive mode, and\ndefine the `kwargs` dictionaries for the backends corresponding to their\n`read_csv` functions:\n\n::: {#346b193e .cell execution_count=1}\n``` {.python .cell-code}\nimport ibis\nimport polars as pl\nimport pyarrow as pa\n\nibis.options.interactive = True\n\nduckdb_kwargs = {\n    \"delim\": \";\",\n    \"header\": False,\n    \"columns\": {\"station\": \"VARCHAR\", \"temperature\": \"DOUBLE\"},\n}\n\npolars_kwargs = {\n    \"separator\": \";\",\n    \"has_header\": False,\n    \"new_columns\": [\"station\", \"temperature\"],\n    \"schema\": {\"station\": pl.Utf8, \"temperature\": pl.Float64},\n}\n\ndatafusion_kwargs = {\n    \"delimiter\": \";\",\n    \"has_header\": False,\n    \"schema\": pa.schema(\n        [\n            (\n                \"station\",\n                pa.string(),\n            ),\n            (\n                \"temperature\",\n                pa.float64(),\n            ),\n        ]\n    ),\n    \"file_extension\": \".txt\",\n}\n```\n:::\n\n\nLet's define a function to run the same code with each backend to complete the challenge:\n\n::: {#f8ddfb05 .cell execution_count=2}\n``` {.python .cell-code}\ndef run_challenge(t):\n    res = (\n        t.group_by(ibis._.station)\n        .agg(\n            min_temp=ibis._.temperature.min(),\n            mean_temp=ibis._.temperature.mean(),\n            max_temp=ibis._.temperature.max(),\n        )\n        .order_by(ibis._.station.desc())\n    )\n    return res\n```\n:::\n\n\n### Completing the challenge\n\nLet's complete the challenge with each backend.\n\n:::{.callout-note}\nThe results are the same across backends but look suspicious. It is noted in the\nrepository that the Python generation code is \"unofficial\", so it may have some\nproblems. Given this is a contrived example of generated data, I'm not going to\nworry about it.\n\nThe point is that we can easily complete the challenge with the same code across\nmany backends, letting them worry about the details of execution. For this\nreason, I'm also not providing execution times. 
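\n\nIf you are curious anyway, rough wall-clock numbers are easy to collect -- a\nminimal sketch, assuming the `res` defined per backend below (this measures\ncompilation plus execution):\n\n```{.python}\nimport time\n\nstart = time.time()\nres.to_pandas()  # force full execution of the deferred expression\nprint(f\"{time.time() - start:.1f} seconds\")\n```\n\n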
Try it out yourself!\n:::\n\n::: {.panel-tabset}\n\n## DuckDB\n\nFirst let's set the backend to DuckDB (redundantly since it's the default) and\nthe `kwargs` dictionary:\n\n::: {#1a358abe .cell execution_count=3}\n``` {.python .cell-code}\nibis.set_backend(\"duckdb\") # <1>\nkwargs = duckdb_kwargs\n```\n:::\n\n\n\n\n1. Redundant given DuckDB is the default\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#ead5cf27 .cell execution_count=5}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=5}\n```{=html}\n
<pre>┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station     ┃ temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string      │ float64     │\n├─────────────┼─────────────┤\n│ Lívingston  │       -21.0 │\n│ Annūr       │       -33.4 │\n│ Beni Douala │        16.5 │\n└─────────────┴─────────────┘\n</pre>
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#9b1bfb39 .cell execution_count=6}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=6}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#69c639dc .cell execution_count=7}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=7}\n```{=html}\n
<pre>┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station        ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string         │ float64  │ float64   │ float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel  │    -99.9 │  0.112188 │     99.9 │\n│ ’Aïn el Hammam │    -99.9 │ -0.225289 │     99.9 │\n│ ’Aïn Roua      │    -99.9 │ -0.198241 │     99.9 │\n│ ‘Ibrī          │    -99.9 │  0.009499 │     99.9 │\n│ ‘Ayn al ‘Arab  │    -99.9 │  0.124730 │     99.9 │\n│ ‘Akko          │    -99.9 │ -0.087184 │     99.9 │\n│ ‘Afrīn         │    -99.9 │ -0.013322 │     99.9 │\n│ Ấp Tân Ngãi    │    -99.9 │  0.344089 │     99.9 │\n│ Ẕefat          │    -99.9 │  0.017767 │     99.9 │\n│ Ḩīsh           │    -99.9 │  0.018804 │     99.9 │\n│ …              │        … │         … │        … │\n└────────────────┴──────────┴───────────┴──────────┘\n</pre>
\n```\n:::\n:::\n\n\n## Polars\n\nFirst let's set the backend to Polars and the `kwargs` dictionary:\n\n::: {#faa766a5 .cell execution_count=8}\n``` {.python .cell-code}\nibis.set_backend(\"polars\") # <1>\nkwargs = polars_kwargs\n```\n:::\n\n\n1. Set Polars as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#8c6bfb71 .cell execution_count=9}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=9}\n```{=html}\n
<pre>┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station     ┃ temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string      │ float64     │\n├─────────────┼─────────────┤\n│ Lívingston  │       -21.0 │\n│ Annūr       │       -33.4 │\n│ Beni Douala │        16.5 │\n└─────────────┴─────────────┘\n</pre>
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#623dfb90 .cell execution_count=10}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=10}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#e9a74f6e .cell execution_count=11}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=11}\n```{=html}\n
<pre>┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station        ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string         │ float64  │ float64   │ float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel  │    -99.9 │  0.112188 │     99.9 │\n│ ’Aïn el Hammam │    -99.9 │ -0.225289 │     99.9 │\n│ ’Aïn Roua      │    -99.9 │ -0.198241 │     99.9 │\n│ ‘Ibrī          │    -99.9 │  0.009499 │     99.9 │\n│ ‘Ayn al ‘Arab  │    -99.9 │  0.124730 │     99.9 │\n│ ‘Akko          │    -99.9 │ -0.087184 │     99.9 │\n│ ‘Afrīn         │    -99.9 │ -0.013322 │     99.9 │\n│ Ấp Tân Ngãi    │    -99.9 │  0.344089 │     99.9 │\n│ Ẕefat          │    -99.9 │  0.017767 │     99.9 │\n│ Ḩīsh           │    -99.9 │  0.018804 │     99.9 │\n│ …              │        … │         … │        … │\n└────────────────┴──────────┴───────────┴──────────┘\n</pre>
\n```\n:::\n:::\n\n\n## DataFusion\n\nFirst let's set the backend to DataFusion and the `kwargs` dictionary:\n\n::: {#bd4f7f44 .cell execution_count=12}\n``` {.python .cell-code}\nibis.set_backend(\"datafusion\") # <1>\nkwargs = datafusion_kwargs\n```\n:::\n\n\n1. Set DataFusion as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#64e7dc60 .cell execution_count=13}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=13}\n```{=html}\n
<pre>┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station     ┃ temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string      │ float64     │\n├─────────────┼─────────────┤\n│ Lívingston  │       -21.0 │\n│ Annūr       │       -33.4 │\n│ Beni Douala │        16.5 │\n└─────────────┴─────────────┘\n</pre>
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#5299d343 .cell execution_count=14}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=14}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#b7fd88b7 .cell execution_count=15}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=15}\n```{=html}\n
<pre>┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station        ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string         │ float64  │ float64   │ float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel  │    -99.9 │  0.112188 │     99.9 │\n│ ’Aïn el Hammam │    -99.9 │ -0.225289 │     99.9 │\n│ ’Aïn Roua      │    -99.9 │ -0.198241 │     99.9 │\n│ ‘Ibrī          │    -99.9 │  0.009499 │     99.9 │\n│ ‘Ayn al ‘Arab  │    -99.9 │  0.124730 │     99.9 │\n│ ‘Akko          │    -99.9 │ -0.087184 │     99.9 │\n│ ‘Afrīn         │    -99.9 │ -0.013322 │     99.9 │\n│ Ấp Tân Ngãi    │    -99.9 │  0.344089 │     99.9 │\n│ Ẕefat          │    -99.9 │  0.017767 │     99.9 │\n│ Ḩīsh           │    -99.9 │  0.018804 │     99.9 │\n│ …              │        … │         … │        … │\n└────────────────┴──────────┴───────────┴──────────┘\n</pre>
\n```\n:::\n:::\n\n\n:::\n\n## Bonus: more billion row data generation\n\nWhile we're here, I'll share the code I've used in the past to generate a\nbillion rows of random data:\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nsql_str = \"\"\nsql_str += \"select\\n\"\nfor c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1))):\n sql_str += f\" random() as {c},\\n\"\nsql_str += f\"from generate_series(1, {ROWS})\"\n\nt = con.sql(sql_str)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nNowadays I'd convert that to an Ibis expression:\n\n:::{.callout-note}\nThis is a slightly different result with a monotonic index column, but I prefer\nit anyway. You could drop that column or adjust the expression.\n:::\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nt = (\n ibis.range(ROWS)\n .unnest()\n .name(\"index\")\n .as_table()\n .mutate(**{c: ibis.random() for c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1)))})\n)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nBut if you do need to construct a programmatic SQL string, it's cool that you\ncan!\n\n## Conclusion\n\nWhile the one billion row challenge isn't a great benchmark, it's a fun way to\ndemonstrate how Ibis provides a single Python dataframe API to take the billion\nrow challenge with DuckDB, Polars, and DataFusion. Feel free to try it out with\nother backends!\n\nHappy coding!\n\n", "supporting": [ "index_files/figure-html" ], diff --git a/docs/posts/1brc/index.qmd b/docs/posts/1brc/index.qmd index ada9a116f9b8..b527cdd8f88d 100644 --- a/docs/posts/1brc/index.qmd +++ b/docs/posts/1brc/index.qmd @@ -7,6 +7,7 @@ categories: - duckdb - polars - datafusion + - portability --- ## Overview From 64fbf1579c02201f2c42e7a4adc5a3012263e7d5 Mon Sep 17 00:00:00 2001 From: Cody Date: Fri, 19 Jan 2024 12:10:39 -0500 Subject: [PATCH 12/13] swap conclusion and bonus sections --- docs/posts/1brc/index.qmd | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/posts/1brc/index.qmd b/docs/posts/1brc/index.qmd index b527cdd8f88d..5144449e71b9 100644 --- a/docs/posts/1brc/index.qmd +++ b/docs/posts/1brc/index.qmd @@ -285,6 +285,15 @@ res ::: +## Conclusion + +While the one billion row challenge isn't a great benchmark, it's a fun way to +demonstrate how Ibis provides a single Python dataframe API to take the billion +row challenge with DuckDB, Polars, and DataFusion. Feel free to try it out with +other backends! + +Happy coding! + ## Bonus: more billion row data generation While we're here, I'll share the code I've used in the past to generate a @@ -333,12 +342,3 @@ con.create_table("billion", t, overwrite=True) But if you do need to construct a programmatic SQL string, it's cool that you can! - -## Conclusion - -While the one billion row challenge isn't a great benchmark, it's a fun way to -demonstrate how Ibis provides a single Python dataframe API to take the billion -row challenge with DuckDB, Polars, and DataFusion. Feel free to try it out with -other backends! - -Happy coding! 
From 9a98a5071bc1a7539fa328544537c726ddb3e4b7 Mon Sep 17 00:00:00 2001 From: Cody Date: Fri, 19 Jan 2024 12:31:06 -0500 Subject: [PATCH 13/13] forgot the freeze --- docs/_freeze/posts/1brc/index/execute-results/html.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/_freeze/posts/1brc/index/execute-results/html.json b/docs/_freeze/posts/1brc/index/execute-results/html.json index fa605313d516..8cb7d020f958 100644 --- a/docs/_freeze/posts/1brc/index/execute-results/html.json +++ b/docs/_freeze/posts/1brc/index/execute-results/html.json @@ -1,8 +1,8 @@ { - "hash": "708d488ca91f22708b5c5e94fd03e5da", + "hash": "82db55d1ca02427a0ed68841420637fd", "result": { "engine": "jupyter", - "markdown": "---\ntitle: \"Using one Python dataframe API to take the billion row challenge with DuckDB, Polars, and DataFusion\"\nauthor: \"Cody\"\ndate: \"2024-01-22\"\ncategories:\n - blog\n - duckdb\n - polars\n - datafusion\n - portability\n---\n\n## Overview\n\nThis is an implementation of the [The One Billion Row\nChallenge](https://www.morling.dev/blog/one-billion-row-challenge/):\n\n> Let’s kick off 2024 true coder style—​I’m excited to announce the One Billion\n> Row Challenge (1BRC), running from Jan 1 until Jan 31.\n\n> Your mission, should you decide to accept it, is deceptively simple: write a\n> Java program for retrieving temperature measurement values from a text file and\n> calculating the min, mean, and max temperature per weather station. There’s just\n> one caveat: the file has 1,000,000,000 rows!\n\nI haven't written Java since dropping a computer science course my second year\nof college that forced us to do functional programming exclusively in Java.\nHowever, I'll gladly take the challenge in Python using Ibis! In fact, I did\nsomething like this (generating a billion rows with 26 columns of random numbers\nand doing basic aggregations) to test out DuckDB and Polars.\n\nIn this blog, we'll demonstrate how Ibis provides a single Python dataframe API\nto take the billion row challenge with DuckDB, Polars, and DataFusion.\n\n## Setup\n\nWe need to generate the data from the challenge. First, clone the\n[repo](https://github.com/gunnarmorling/1brc):\n\n```{.bash}\ngh repo clone gunnarmorling/1brc\n```\n\nThen change into the Python directory and run the generation script with the\nnumber of rows you want to generate:\n\n```{.bash}\ncd 1brc/src/main/python\npython create_measurements.py 1_000_000_000\n```\n\nThis will generate a file called `measurements.txt` in the `data` directory at\nthe root of the repo. It is 15GB on disk:\n\n```{.bash}\n(venv) cody@voda 1brc % du 1brc/data/*\n 15G 1brc/data/measurements.txt\n808K 1brc/data/weather_stations.csv\n```\n\nAnd consists of one billion rows with two columns separated by a semicolon:\n\n```{.bash}\n(venv) cody@voda 1brc % head 1brc/data/measurements.txt\nKusugal;-67.2\nIpil;-88.6\nSohna;-31.2\nLubuagan;-2.3\nSzentes;29.2\nSylvan Lake;-70.7\nAmbato;-35.2\nBerkine;97.0\nWernau;73.4\nKennewick;-19.9\n```\n\nAlso, you'll need to install Ibis with the three backends we'll use:\n\n```{.bash}\npip install ibis-framework[duckdb,polars,datafusion]\n```\n\n## Understanding Ibis\n\nIbis provides a standard dataframe API decoupled from the execution engine. 
It\ncompiles Ibis expressions to a form of intermediary representation (often SQL)\nthat can be executed by different backends.\n\nThis allows us to write a single Ibis expression to complete the challenge with\nmany different execution engine backends.\n\n:::{.callout-warning}\nWhile Ibis does its best to abstract away the differences between backends, this\ncannot be done in some areas like data input and output. For example, the\n`read_csv` function across various backends (in their SQL and Python forms) have\ndifferent parameters. We'll handle that with different `kwargs` dictionaries for\nthese backends in this post.\n\nIn general, besides creating a connection and data input/output, the Ibis API is\nthe same across backends.\n:::\n\n## Completing the challenge thrice\n\nWe'll use three great options for local backends -- DuckDB, Polars, and\nDataFusion -- to complete the challenge.\n\n### Setup\n\nBefore we get started, we'll make some imports, turn on interactive mode, and\ndefine the `kwargs` dictionary for the backends corresponding to their\n`read_csv` function:\n\n::: {#346b193e .cell execution_count=1}\n``` {.python .cell-code}\nimport ibis\nimport polars as pl\nimport pyarrow as pa\n\nibis.options.interactive = True\n\nduckdb_kwargs = {\n \"delim\": \";\",\n \"header\": False,\n \"columns\": {\"station\": \"VARCHAR\", \"temperature\": \"DOUBLE\"},\n}\n\npolars_kwargs = {\n \"separator\": \";\",\n \"has_header\": False,\n \"new_columns\": [\"station\", \"temperature\"],\n \"schema\": {\"station\": pl.Utf8, \"temperature\": pl.Float64},\n}\n\ndatafusion_kwargs = {\n \"delimiter\": \";\",\n \"has_header\": False,\n \"schema\": pa.schema(\n [\n (\n \"station\",\n pa.string(),\n ),\n (\n \"temperature\",\n pa.float64(),\n ),\n ]\n ),\n \"file_extension\": \".txt\",\n}\n```\n:::\n\n\nLet's define a function to run the same code with each backend to complete the challenge:\n\n::: {#f8ddfb05 .cell execution_count=2}\n``` {.python .cell-code}\ndef run_challenge(t):\n res = (\n t.group_by(ibis._.station)\n .agg(\n min_temp=ibis._.temperature.min(),\n mean_temp=ibis._.temperature.mean(),\n max_temp=ibis._.temperature.max(),\n )\n .order_by(ibis._.station.desc())\n )\n return res\n```\n:::\n\n\n### Completing the challenge\n\nLet's complete the challenge with each backend.\n\n:::{.callout-note}\nThe results are the same across backends but look suspicious. It is noted in the\nrepository that the Python generation code is \"unofficial\", so may have some\nproblems. Given this is a contrived example of generated data, I'm not going to\nworry about it.\n\nThe point is that we can easily complete the challenge with the same code across\nmany backends, letting them worry about the details of execution. For this\nreason, I'm also not providing execution times. Try it out yourself!\n:::\n\n::: {.panel-tabset}\n\n## DuckDB\n\nFirst let's set the backend to DuckDB (redundantly since it's the default) and\nthe `kwargs` dictionary:\n\n::: {#1a358abe .cell execution_count=3}\n``` {.python .cell-code}\nibis.set_backend(\"duckdb\") # <1>\nkwargs = duckdb_kwargs\n```\n:::\n\n\n\n\n1. Redundant given DuckDB is the default\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#ead5cf27 .cell execution_count=5}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=5}\n```{=html}\n
<pre>┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station     ┃ temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string      │ float64     │\n├─────────────┼─────────────┤\n│ Lívingston  │       -21.0 │\n│ Annūr       │       -33.4 │\n│ Beni Douala │        16.5 │\n└─────────────┴─────────────┘\n</pre>
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#9b1bfb39 .cell execution_count=6}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=6}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#69c639dc .cell execution_count=7}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=7}\n```{=html}\n
<pre>┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station        ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string         │ float64  │ float64   │ float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel  │    -99.9 │  0.112188 │     99.9 │\n│ ’Aïn el Hammam │    -99.9 │ -0.225289 │     99.9 │\n│ ’Aïn Roua      │    -99.9 │ -0.198241 │     99.9 │\n│ ‘Ibrī          │    -99.9 │  0.009499 │     99.9 │\n│ ‘Ayn al ‘Arab  │    -99.9 │  0.124730 │     99.9 │\n│ ‘Akko          │    -99.9 │ -0.087184 │     99.9 │\n│ ‘Afrīn         │    -99.9 │ -0.013322 │     99.9 │\n│ Ấp Tân Ngãi    │    -99.9 │  0.344089 │     99.9 │\n│ Ẕefat          │    -99.9 │  0.017767 │     99.9 │\n│ Ḩīsh           │    -99.9 │  0.018804 │     99.9 │\n│ …              │        … │         … │        … │\n└────────────────┴──────────┴───────────┴──────────┘\n</pre>
\n```\n:::\n:::\n\n\n## Polars\n\nFirst let's set the backend to Polars and the `kwargs` dictionary:\n\n::: {#faa766a5 .cell execution_count=8}\n``` {.python .cell-code}\nibis.set_backend(\"polars\") # <1>\nkwargs = polars_kwargs\n```\n:::\n\n\n1. Set Polars as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#8c6bfb71 .cell execution_count=9}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=9}\n```{=html}\n
<pre>┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station     ┃ temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string      │ float64     │\n├─────────────┼─────────────┤\n│ Lívingston  │       -21.0 │\n│ Annūr       │       -33.4 │\n│ Beni Douala │        16.5 │\n└─────────────┴─────────────┘\n</pre>
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#623dfb90 .cell execution_count=10}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=10}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#e9a74f6e .cell execution_count=11}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=11}\n```{=html}\n
<pre>┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station        ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string         │ float64  │ float64   │ float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel  │    -99.9 │  0.112188 │     99.9 │\n│ ’Aïn el Hammam │    -99.9 │ -0.225289 │     99.9 │\n│ ’Aïn Roua      │    -99.9 │ -0.198241 │     99.9 │\n│ ‘Ibrī          │    -99.9 │  0.009499 │     99.9 │\n│ ‘Ayn al ‘Arab  │    -99.9 │  0.124730 │     99.9 │\n│ ‘Akko          │    -99.9 │ -0.087184 │     99.9 │\n│ ‘Afrīn         │    -99.9 │ -0.013322 │     99.9 │\n│ Ấp Tân Ngãi    │    -99.9 │  0.344089 │     99.9 │\n│ Ẕefat          │    -99.9 │  0.017767 │     99.9 │\n│ Ḩīsh           │    -99.9 │  0.018804 │     99.9 │\n│ …              │        … │         … │        … │\n└────────────────┴──────────┴───────────┴──────────┘\n</pre>
\n```\n:::\n:::\n\n\n## DataFusion\n\nFirst let's set the backend to DataFusion and the `kwargs` dictionary:\n\n::: {#bd4f7f44 .cell execution_count=12}\n``` {.python .cell-code}\nibis.set_backend(\"datafusion\") # <1>\nkwargs = datafusion_kwargs\n```\n:::\n\n\n1. Set DataFusion as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#64e7dc60 .cell execution_count=13}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=13}\n```{=html}\n
<pre>┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station     ┃ temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string      │ float64     │\n├─────────────┼─────────────┤\n│ Lívingston  │       -21.0 │\n│ Annūr       │       -33.4 │\n│ Beni Douala │        16.5 │\n└─────────────┴─────────────┘\n</pre>
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#5299d343 .cell execution_count=14}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=14}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#b7fd88b7 .cell execution_count=15}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=15}\n```{=html}\n
<pre>┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station        ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string         │ float64  │ float64   │ float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel  │    -99.9 │  0.112188 │     99.9 │\n│ ’Aïn el Hammam │    -99.9 │ -0.225289 │     99.9 │\n│ ’Aïn Roua      │    -99.9 │ -0.198241 │     99.9 │\n│ ‘Ibrī          │    -99.9 │  0.009499 │     99.9 │\n│ ‘Ayn al ‘Arab  │    -99.9 │  0.124730 │     99.9 │\n│ ‘Akko          │    -99.9 │ -0.087184 │     99.9 │\n│ ‘Afrīn         │    -99.9 │ -0.013322 │     99.9 │\n│ Ấp Tân Ngãi    │    -99.9 │  0.344089 │     99.9 │\n│ Ẕefat          │    -99.9 │  0.017767 │     99.9 │\n│ Ḩīsh           │    -99.9 │  0.018804 │     99.9 │\n│ …              │        … │         … │        … │\n└────────────────┴──────────┴───────────┴──────────┘\n</pre>
\n```\n:::\n:::\n\n\n:::\n\n## Bonus: more billion row data generation\n\nWhile we're here, I'll share the code I've used in the past to generate a\nbillion rows of random data:\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nsql_str = \"\"\nsql_str += \"select\\n\"\nfor c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1))):\n sql_str += f\" random() as {c},\\n\"\nsql_str += f\"from generate_series(1, {ROWS})\"\n\nt = con.sql(sql_str)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nNowadays I'd convert that to an Ibis expression:\n\n:::{.callout-note}\nThis is a slightly different result with a monotonic index column, but I prefer\nit anyway. You could drop that column or adjust the expression.\n:::\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nt = (\n ibis.range(ROWS)\n .unnest()\n .name(\"index\")\n .as_table()\n .mutate(**{c: ibis.random() for c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1)))})\n)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nBut if you do need to construct a programmatic SQL string, it's cool that you\ncan!\n\n## Conclusion\n\nWhile the one billion row challenge isn't a great benchmark, it's a fun way to\ndemonstrate how Ibis provides a single Python dataframe API to take the billion\nrow challenge with DuckDB, Polars, and DataFusion. Feel free to try it out with\nother backends!\n\nHappy coding!\n\n", + "markdown": "---\ntitle: \"Using one Python dataframe API to take the billion row challenge with DuckDB, Polars, and DataFusion\"\nauthor: \"Cody\"\ndate: \"2024-01-22\"\ncategories:\n - blog\n - duckdb\n - polars\n - datafusion\n - portability\n---\n\n## Overview\n\nThis is an implementation of the [The One Billion Row\nChallenge](https://www.morling.dev/blog/one-billion-row-challenge/):\n\n> Let’s kick off 2024 true coder style—​I’m excited to announce the One Billion\n> Row Challenge (1BRC), running from Jan 1 until Jan 31.\n\n> Your mission, should you decide to accept it, is deceptively simple: write a\n> Java program for retrieving temperature measurement values from a text file and\n> calculating the min, mean, and max temperature per weather station. There’s just\n> one caveat: the file has 1,000,000,000 rows!\n\nI haven't written Java since dropping a computer science course my second year\nof college that forced us to do functional programming exclusively in Java.\nHowever, I'll gladly take the challenge in Python using Ibis! In fact, I did\nsomething like this (generating a billion rows with 26 columns of random numbers\nand doing basic aggregations) to test out DuckDB and Polars.\n\nIn this blog, we'll demonstrate how Ibis provides a single Python dataframe API\nto take the billion row challenge with DuckDB, Polars, and DataFusion.\n\n## Setup\n\nWe need to generate the data from the challenge. First, clone the\n[repo](https://github.com/gunnarmorling/1brc):\n\n```{.bash}\ngh repo clone gunnarmorling/1brc\n```\n\nThen change into the Python directory and run the generation script with the\nnumber of rows you want to generate:\n\n```{.bash}\ncd 1brc/src/main/python\npython create_measurements.py 1_000_000_000\n```\n\nThis will generate a file called `measurements.txt` in the `data` directory at\nthe root of the repo. 
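\n\nGenerating a billion rows takes a while. Once the script finishes, a quick\nsanity check from Python confirms the file looks right before we point Ibis at\nit -- a minimal sketch using only the standard library:\n\n```{.python}\nfrom itertools import islice\n\n# peek at the first rows of the generated file without reading all of it\nwith open(\"1brc/data/measurements.txt\") as f:\n    for row in islice(f, 3):\n        station, temperature = row.strip().split(\";\")\n        print(station, float(temperature))\n```\n\n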
It is 15GB on disk:\n\n```{.bash}\n(venv) cody@voda 1brc % du 1brc/data/*\n 15G 1brc/data/measurements.txt\n808K 1brc/data/weather_stations.csv\n```\n\nAnd consists of one billion rows with two columns separated by a semicolon:\n\n```{.bash}\n(venv) cody@voda 1brc % head 1brc/data/measurements.txt\nKusugal;-67.2\nIpil;-88.6\nSohna;-31.2\nLubuagan;-2.3\nSzentes;29.2\nSylvan Lake;-70.7\nAmbato;-35.2\nBerkine;97.0\nWernau;73.4\nKennewick;-19.9\n```\n\nAlso, you'll need to install Ibis with the three backends we'll use:\n\n```{.bash}\npip install ibis-framework[duckdb,polars,datafusion]\n```\n\n## Understanding Ibis\n\nIbis provides a standard dataframe API decoupled from the execution engine. It\ncompiles Ibis expressions to a form of intermediary representation (often SQL)\nthat can be executed by different backends.\n\nThis allows us to write a single Ibis expression to complete the challenge with\nmany different execution engine backends.\n\n:::{.callout-warning}\nWhile Ibis does its best to abstract away the differences between backends, this\ncannot be done in some areas like data input and output. For example, the\n`read_csv` function across various backends (in their SQL and Python forms) have\ndifferent parameters. We'll handle that with different `kwargs` dictionaries for\nthese backends in this post.\n\nIn general, besides creating a connection and data input/output, the Ibis API is\nthe same across backends.\n:::\n\n## Completing the challenge thrice\n\nWe'll use three great options for local backends -- DuckDB, Polars, and\nDataFusion -- to complete the challenge.\n\n### Setup\n\nBefore we get started, we'll make some imports, turn on interactive mode, and\ndefine the `kwargs` dictionary for the backends corresponding to their\n`read_csv` function:\n\n::: {#b66caea0 .cell execution_count=1}\n``` {.python .cell-code}\nimport ibis\nimport polars as pl\nimport pyarrow as pa\n\nibis.options.interactive = True\n\nduckdb_kwargs = {\n \"delim\": \";\",\n \"header\": False,\n \"columns\": {\"station\": \"VARCHAR\", \"temperature\": \"DOUBLE\"},\n}\n\npolars_kwargs = {\n \"separator\": \";\",\n \"has_header\": False,\n \"new_columns\": [\"station\", \"temperature\"],\n \"schema\": {\"station\": pl.Utf8, \"temperature\": pl.Float64},\n}\n\ndatafusion_kwargs = {\n \"delimiter\": \";\",\n \"has_header\": False,\n \"schema\": pa.schema(\n [\n (\n \"station\",\n pa.string(),\n ),\n (\n \"temperature\",\n pa.float64(),\n ),\n ]\n ),\n \"file_extension\": \".txt\",\n}\n```\n:::\n\n\nLet's define a function to run the same code with each backend to complete the challenge:\n\n::: {#8d7a8f3b .cell execution_count=2}\n``` {.python .cell-code}\ndef run_challenge(t):\n res = (\n t.group_by(ibis._.station)\n .agg(\n min_temp=ibis._.temperature.min(),\n mean_temp=ibis._.temperature.mean(),\n max_temp=ibis._.temperature.max(),\n )\n .order_by(ibis._.station.desc())\n )\n return res\n```\n:::\n\n\n### Completing the challenge\n\nLet's complete the challenge with each backend.\n\n:::{.callout-note}\nThe results are the same across backends but look suspicious. It is noted in the\nrepository that the Python generation code is \"unofficial\", so may have some\nproblems. Given this is a contrived example of generated data, I'm not going to\nworry about it.\n\nThe point is that we can easily complete the challenge with the same code across\nmany backends, letting them worry about the details of execution. For this\nreason, I'm also not providing execution times. 
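\n\nIf you are curious anyway, rough wall-clock numbers are easy to collect -- a\nminimal sketch, assuming the `res` defined per backend below (this measures\ncompilation plus execution):\n\n```{.python}\nimport time\n\nstart = time.time()\nres.to_pandas()  # force full execution of the deferred expression\nprint(f\"{time.time() - start:.1f} seconds\")\n```\n\n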
Try it out yourself!\n:::\n\n::: {.panel-tabset}\n\n## DuckDB\n\nFirst let's set the backend to DuckDB (redundantly since it's the default) and\nthe `kwargs` dictionary:\n\n::: {#a63ac8cc .cell execution_count=3}\n``` {.python .cell-code}\nibis.set_backend(\"duckdb\") # <1>\nkwargs = duckdb_kwargs\n```\n:::\n\n\n\n\n1. Redundant given DuckDB is the default\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#73800ddb .cell execution_count=5}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=5}\n```{=html}\n
<pre>┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station     ┃ temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string      │ float64     │\n├─────────────┼─────────────┤\n│ Lívingston  │       -21.0 │\n│ Annūr       │       -33.4 │\n│ Beni Douala │        16.5 │\n└─────────────┴─────────────┘\n</pre>
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#cfe6bf62 .cell execution_count=6}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=6}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#b01a5986 .cell execution_count=7}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=7}\n```{=html}\n
<pre>┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station        ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string         │ float64  │ float64   │ float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel  │    -99.9 │  0.112188 │     99.9 │\n│ ’Aïn el Hammam │    -99.9 │ -0.225289 │     99.9 │\n│ ’Aïn Roua      │    -99.9 │ -0.198241 │     99.9 │\n│ ‘Ibrī          │    -99.9 │  0.009499 │     99.9 │\n│ ‘Ayn al ‘Arab  │    -99.9 │  0.124730 │     99.9 │\n│ ‘Akko          │    -99.9 │ -0.087184 │     99.9 │\n│ ‘Afrīn         │    -99.9 │ -0.013322 │     99.9 │\n│ Ấp Tân Ngãi    │    -99.9 │  0.344089 │     99.9 │\n│ Ẕefat          │    -99.9 │  0.017767 │     99.9 │\n│ Ḩīsh           │    -99.9 │  0.018804 │     99.9 │\n│ …              │        … │         … │        … │\n└────────────────┴──────────┴───────────┴──────────┘\n</pre>
\n```\n:::\n:::\n\n\n## Polars\n\nFirst let's set the backend to Polars and the `kwargs` dictionary:\n\n::: {#bf41374d .cell execution_count=8}\n``` {.python .cell-code}\nibis.set_backend(\"polars\") # <1>\nkwargs = polars_kwargs\n```\n:::\n\n\n1. Set Polars as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#cac9f1fe .cell execution_count=9}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=9}\n```{=html}\n
<pre>┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station     ┃ temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string      │ float64     │\n├─────────────┼─────────────┤\n│ Lívingston  │       -21.0 │\n│ Annūr       │       -33.4 │\n│ Beni Douala │        16.5 │\n└─────────────┴─────────────┘\n</pre>
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#87cd1a0a .cell execution_count=10}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=10}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#778990e7 .cell execution_count=11}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=11}\n```{=html}\n
<pre>┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station        ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string         │ float64  │ float64   │ float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel  │    -99.9 │  0.112188 │     99.9 │\n│ ’Aïn el Hammam │    -99.9 │ -0.225289 │     99.9 │\n│ ’Aïn Roua      │    -99.9 │ -0.198241 │     99.9 │\n│ ‘Ibrī          │    -99.9 │  0.009499 │     99.9 │\n│ ‘Ayn al ‘Arab  │    -99.9 │  0.124730 │     99.9 │\n│ ‘Akko          │    -99.9 │ -0.087184 │     99.9 │\n│ ‘Afrīn         │    -99.9 │ -0.013322 │     99.9 │\n│ Ấp Tân Ngãi    │    -99.9 │  0.344089 │     99.9 │\n│ Ẕefat          │    -99.9 │  0.017767 │     99.9 │\n│ Ḩīsh           │    -99.9 │  0.018804 │     99.9 │\n│ …              │        … │         … │        … │\n└────────────────┴──────────┴───────────┴──────────┘\n</pre>
\n```\n:::\n:::\n\n\n## DataFusion\n\nFirst let's set the backend to DataFusion and the `kwargs` dictionary:\n\n::: {#1a714b65 .cell execution_count=12}\n``` {.python .cell-code}\nibis.set_backend(\"datafusion\") # <1>\nkwargs = datafusion_kwargs\n```\n:::\n\n\n1. Set DataFusion as the default backend used\n\nNext, we'll read in the data and take a look at the table:\n\n::: {#232867fd .cell execution_count=13}\n``` {.python .cell-code}\nt = ibis.read_csv(\"1brc/data/measurements.txt\", **kwargs)\nt.limit(3)\n```\n\n::: {.cell-output .cell-output-display execution_count=13}\n```{=html}\n
<pre>┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n┃ station     ┃ temperature ┃\n┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n│ string      │ float64     │\n├─────────────┼─────────────┤\n│ Lívingston  │       -21.0 │\n│ Annūr       │       -33.4 │\n│ Beni Douala │        16.5 │\n└─────────────┴─────────────┘\n</pre>
\n```\n:::\n:::\n\n\nThen let's confirm it's **a billion** rows:\n\n::: {#bb715c5f .cell execution_count=14}\n``` {.python .cell-code}\nf\"{t.count().to_pandas():,}\"\n```\n\n::: {.cell-output .cell-output-display execution_count=14}\n```\n'1,000,000,000'\n```\n:::\n:::\n\n\nFinally, we'll compute the min, mean, and max temperature per weather station:\n\n::: {#fedee251 .cell execution_count=15}\n``` {.python .cell-code}\nres = run_challenge(t)\nres\n```\n\n::: {.cell-output .cell-output-display execution_count=15}\n```{=html}\n
<pre>┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓\n┃ station        ┃ min_temp ┃ mean_temp ┃ max_temp ┃\n┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩\n│ string         │ float64  │ float64   │ float64  │\n├────────────────┼──────────┼───────────┼──────────┤\n│ ’s-Gravendeel  │    -99.9 │  0.112188 │     99.9 │\n│ ’Aïn el Hammam │    -99.9 │ -0.225289 │     99.9 │\n│ ’Aïn Roua      │    -99.9 │ -0.198241 │     99.9 │\n│ ‘Ibrī          │    -99.9 │  0.009499 │     99.9 │\n│ ‘Ayn al ‘Arab  │    -99.9 │  0.124730 │     99.9 │\n│ ‘Akko          │    -99.9 │ -0.087184 │     99.9 │\n│ ‘Afrīn         │    -99.9 │ -0.013322 │     99.9 │\n│ Ấp Tân Ngãi    │    -99.9 │  0.344089 │     99.9 │\n│ Ẕefat          │    -99.9 │  0.017767 │     99.9 │\n│ Ḩīsh           │    -99.9 │  0.018804 │     99.9 │\n│ …              │        … │         … │        … │\n└────────────────┴──────────┴───────────┴──────────┘\n</pre>
\n```\n:::\n:::\n\n\n:::\n\n## Conclusion\n\nWhile the one billion row challenge isn't a great benchmark, it's a fun way to\ndemonstrate how Ibis provides a single Python dataframe API to take the billion\nrow challenge with DuckDB, Polars, and DataFusion. Feel free to try it out with\nother backends!\n\nHappy coding!\n\n## Bonus: more billion row data generation\n\nWhile we're here, I'll share the code I've used in the past to generate a\nbillion rows of random data:\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nsql_str = \"\"\nsql_str += \"select\\n\"\nfor c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1))):\n sql_str += f\" random() as {c},\\n\"\nsql_str += f\"from generate_series(1, {ROWS})\"\n\nt = con.sql(sql_str)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nNowadays I'd convert that to an Ibis expression:\n\n:::{.callout-note}\nThis is a slightly different result with a monotonic index column, but I prefer\nit anyway. You could drop that column or adjust the expression.\n:::\n\n```{.python}\nimport ibis\n\ncon = ibis.connect(\"duckdb://data.ddb\")\n\nROWS = 1_000_000_000\n\nt = (\n ibis.range(ROWS)\n .unnest()\n .name(\"index\")\n .as_table()\n .mutate(**{c: ibis.random() for c in list(map(chr, range(ord(\"a\"), ord(\"z\") + 1)))})\n)\ncon.create_table(\"billion\", t, overwrite=True)\n```\n\nBut if you do need to construct a programmatic SQL string, it's cool that you\ncan!\n\n", "supporting": [ "index_files/figure-html" ],