Skip to content

Commit

Permalink
docs(blog-post): add blog post comparing ibis to pandas and dask
Browse files Browse the repository at this point in the history
  • Loading branch information
cpcloud committed Dec 6, 2023
1 parent ed47c74 commit a7fd32b
Show file tree
Hide file tree
Showing 11 changed files with 1,095 additions and 14 deletions.

Large diffs are not rendered by default.

39 changes: 39 additions & 0 deletions docs/posts/pydata-performance/dask_impl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from __future__ import annotations

import logging

import dask.dataframe as dd
from dask.distributed import Client

if __name__ == "__main__":
    # Count, per month and per language, the distinct PyPI projects that ship
    # compiled-language source files, using dask.dataframe on a distributed
    # client, and print the resulting table.
    #
    # `silence_logs` is given logging.ERROR so that (per the distributed
    # docs) less severe log records from the cluster are not printed.
    client = Client(silence_logs=logging.ERROR)
    try:
        # Load only the three columns the query touches.
        # NOTE(review): split_row_groups=True is documented as mapping each
        # parquet row group to its own partition -- confirm for the dask
        # version in use.
        df = dd.read_parquet(
            "/data/pypi-parquet/*.parquet",
            columns=["path", "uploaded_on", "project_name"],
            split_row_groups=True,
        )

        # Keep rows whose path ends in one of the listed source-file
        # extensions, and drop rows whose path has a segment starting with
        # "test" or contains "/site-packages/".
        df = df[
            df.path.str.contains(
                r"\.(?:asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$"
            )
            & ~df.path.str.contains(r"(?:^|/)test(?:|s|ing)")
            & ~df.path.str.contains("/site-packages/")
        ]

        print(
            df.assign(
                # Truncate the upload time to the first instant of its month.
                month=df.uploaded_on.dt.to_period("M").dt.to_timestamp(),
                # Pull out the trailing extension, then map it to a language
                # label. Order matters: the C-family alternation runs first,
                # then anything starting with "f" becomes "Fortran", then the
                # remaining extensions are replaced one by one.
                # NOTE(review): the extraction pattern is lowercase-only, so
                # an extension such as ".F90" that passed the filter above
                # yields a missing value here and is presumably dropped by
                # the groupby -- confirm this is intended.
                # NOTE(review): the last three replaces do not pass `regex=`
                # and therefore rely on the library default.
                ext=df.path.str.extract(r"\.([a-z0-9]+)$", 0, expand=False)
                .str.replace(r"cxx|cpp|cc|c|hpp|h", "C/C++", regex=True)
                .str.replace("^f.*$", "Fortran", regex=True)
                .str.replace("rs", "Rust")
                .str.replace("go", "Go")
                .str.replace("asm", "Assembly"),
            )
            .groupby(["month", "ext"])
            .project_name.nunique()
            .rename("project_count")
            # Everything above is lazy; compute() runs it on the cluster and
            # the remaining steps operate on the materialized result.
            .compute()
            .reset_index()
            .sort_values(["month", "project_count"], ascending=False)
        )
    finally:
        # Always tear the client down, even when the query fails; otherwise
        # an exception above would skip the shutdown call entirely.
        client.shutdown()
Loading

0 comments on commit a7fd32b

Please sign in to comment.