Skip to content

Commit

Permalink
docs(blog-post): pydata performance part 2; polars and datafusion
Browse files Browse the repository at this point in the history
  • Loading branch information
cpcloud committed Dec 12, 2023
1 parent a3c1c07 commit 36e1db5
Show file tree
Hide file tree
Showing 10 changed files with 492 additions and 9 deletions.

Large diffs are not rendered by default.

33 changes: 33 additions & 0 deletions docs/posts/pydata-performance-part2/datafusion_ibis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from __future__ import annotations

import ibis
from ibis import _

# Execute every expression built below on the DataFusion backend.
ibis.set_backend("datafusion")

# Row predicates, expressed as deferred column expressions.
# Path ends in one of the listed source-file extensions.
has_source_ext = _.path.re_search(
    r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$"
)
# Path has "test", "tests" or "testing" at the start or right after a "/".
looks_like_test = _.path.re_search(r"(^|/)test(|s|ing)")
# Path contains a "/site-packages/" segment.
in_site_packages = _.path.contains("/site-packages/")

# Grouping key: take the trailing lowercase extension and rewrite it into a
# label. The replacements run in this order; an empty result becomes NULL so
# that `dropna("ext")` below can discard it.
ext_label = (
    _.path.re_extract(r"\.([a-z0-9]+)$", 1)
    .re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++")
    .re_replace("^f.*$", "Fortran")
    .replace("rs", "Rust")
    .replace("go", "Go")
    .replace("asm", "Assembly")
    .nullif("")
)

files = ibis.read_parquet("/data/pypi-parquet/*.parquet")

# Distinct project count per (month, ext), newest month first and, within a
# month, largest count first.
expr = (
    files.filter([has_source_ext, ~looks_like_test, ~in_site_packages])
    .group_by(month=_.uploaded_on.truncate("M"), ext=ext_label)
    .aggregate(project_count=_.project_name.nunique())
    .dropna("ext")
    .order_by([_.month.desc(), _.project_count.desc()])
)

# Run the query and collect the result as a pandas DataFrame.
df = expr.to_pandas()
12 changes: 12 additions & 0 deletions docs/posts/pydata-performance-part2/datafusion_native.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from __future__ import annotations

import datafusion

# Read the SQL text; the path is resolved against the current working
# directory. The encoding is given explicitly so decoding does not depend on
# the platform locale.
with open("./datafusion_native.sql", encoding="utf-8") as f:
    query = f.read()

# Register the parquet files under the table name "pypi" and build the query
# against that session.
ctx = datafusion.SessionContext()
ctx.register_parquet(name="pypi", path="/data/pypi-parquet/*.parquet")
expr = ctx.sql(query)

# Run the query and collect the result as a pandas DataFrame.
df = expr.to_pandas()
47 changes: 47 additions & 0 deletions docs/posts/pydata-performance-part2/datafusion_native.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
-- Count distinct projects per (month, ext), newest month first and, within a
-- month, largest count first. The inner query filters rows of `pypi` by path
-- and derives `month` and `ext`; the outer query drops NULL `ext` and
-- aggregates.
SELECT
month,
ext,
COUNT(DISTINCT project_name) AS project_count
FROM (
SELECT
project_name,
DATE_TRUNC('month', uploaded_on) AS month,
-- ext: extract the trailing lowercase extension, rewrite it through the
-- replacements below (innermost first), and turn an empty string into NULL.
NULLIF(
REPLACE(
REPLACE(
REPLACE(
REGEXP_REPLACE(
REGEXP_REPLACE(
-- The pattern is wrapped in an extra outer group, so the extension without
-- its dot is the second capture group, hence the [2] index.
REGEXP_MATCH(path, CONCAT('(', '\.([a-z0-9]+)$', ')'))[2],
'cxx|cpp|cc|c|hpp|h',
'C/C++',
'g'
),
'^f.*$',
'Fortran',
'g'
),
'rs',
'Rust'
),
'go',
'Go'
),
'asm',
'Assembly'
),
''
) AS ext
FROM pypi
-- Keep paths ending in one of the listed source-file extensions. COALESCE
-- turns a NULL comparison result into FALSE.
WHERE COALESCE(
ARRAY_LENGTH(
REGEXP_MATCH(path, '\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$')
) > 0,
FALSE
)
-- Exclude paths with "test", "tests" or "testing" at the start or right
-- after a "/".
AND NOT COALESCE(ARRAY_LENGTH(REGEXP_MATCH(path, '(^|/)test(|s|ing)')) > 0, FALSE)
-- Exclude paths containing "/site-packages/".
AND NOT STRPOS(path, '/site-packages/') > 0
)
WHERE ext IS NOT NULL
GROUP BY month, ext
ORDER BY month DESC, project_count DESC
31 changes: 31 additions & 0 deletions docs/posts/pydata-performance-part2/duckdb_ibis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from __future__ import annotations

import ibis
from ibis import _

# No backend is selected in this script, so ibis uses its default backend.

# Row predicates, expressed as deferred column expressions.
# Path ends in one of the listed source-file extensions.
has_source_ext = _.path.re_search(
    r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$"
)
# Path has "test", "tests" or "testing" at the start or right after a "/".
looks_like_test = _.path.re_search(r"(^|/)test(|s|ing)")
# Path contains a "/site-packages/" segment.
in_site_packages = _.path.contains("/site-packages/")

# Grouping key: take the trailing lowercase extension and rewrite it into a
# label. The replacements run in this order; an empty result becomes NULL so
# that `dropna("ext")` below can discard it.
ext_label = (
    _.path.re_extract(r"\.([a-z0-9]+)$", 1)
    .re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++")
    .re_replace("^f.*$", "Fortran")
    .replace("rs", "Rust")
    .replace("go", "Go")
    .replace("asm", "Assembly")
    .nullif("")
)

files = ibis.read_parquet("/data/pypi-parquet/*.parquet")

# Distinct project count per (month, ext), newest month first and, within a
# month, largest count first.
expr = (
    files.filter([has_source_ext, ~looks_like_test, ~in_site_packages])
    .group_by(month=_.uploaded_on.truncate("M"), ext=ext_label)
    .aggregate(project_count=_.project_name.nunique())
    .dropna("ext")
    .order_by([_.month.desc(), _.project_count.desc()])  # <1>
)

# Run the query and collect the result as a pandas DataFrame.
df = expr.to_pandas()
Loading

0 comments on commit 36e1db5

Please sign in to comment.