diff --git a/examples/README.md b/examples/README.md
index 25ee6b4eb..3c0c3e2d1 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -65,6 +65,7 @@
 - [`loader-parquet`](https://observablehq.observablehq.cloud/framework-example-loader-parquet/) - Generate Apache Parquet files
 - [`loader-postgres`](https://observablehq.observablehq.cloud/framework-example-loader-postgres/) - Load data from PostgreSQL
 - [`loader-python-to-csv`](https://observablehq.observablehq.cloud/framework-example-loader-python-to-csv/) - Generate CSV from Python
+- [`loader-python-to-parquet`](https://observablehq.observablehq.cloud/framework-example-loader-python-to-parquet/) - Generate Apache Parquet from Python
 - [`loader-python-to-png`](https://observablehq.observablehq.cloud/framework-example-loader-python-to-png/) - Generate PNG from Python
 - [`loader-python-to-zip`](https://observablehq.observablehq.cloud/framework-example-loader-python-to-zip/) - Generate ZIP from Python
 - [`loader-r-to-csv`](https://observablehq.observablehq.cloud/framework-example-loader-r-to-csv/) - Generate CSV from R
diff --git a/examples/loader-python-to-parquet/.gitignore b/examples/loader-python-to-parquet/.gitignore
new file mode 100644
index 000000000..0922a3170
--- /dev/null
+++ b/examples/loader-python-to-parquet/.gitignore
@@ -0,0 +1,5 @@
+.DS_Store
+/dist/
+node_modules/
+yarn-error.log
+.venv
diff --git a/examples/loader-python-to-parquet/README.md b/examples/loader-python-to-parquet/README.md
new file mode 100644
index 000000000..3350bc74f
--- /dev/null
+++ b/examples/loader-python-to-parquet/README.md
@@ -0,0 +1,7 @@
+[Framework examples →](../)
+
+# Python data loader to generate Apache Parquet
+
+View live: https://observablehq.observablehq.cloud/framework-example-loader-python-to-parquet/
+
+This Observable Framework example demonstrates how to write a Python data loader that outputs an Apache Parquet file using the [pyarrow](https://pypi.org/project/pyarrow/) library. The loader reads a CSV with records for over 91,000 dams in the United States from the [National Inventory of Dams](https://nid.sec.usace.army.mil/), selects several columns, then writes the resulting data frame as an Apache Parquet file to standard output. The data loader lives in [`src/data/us-dams.parquet.py`](./src/data/us-dams.parquet.py).
diff --git a/examples/loader-python-to-parquet/observablehq.config.js b/examples/loader-python-to-parquet/observablehq.config.js
new file mode 100644
index 000000000..fb0f92431
--- /dev/null
+++ b/examples/loader-python-to-parquet/observablehq.config.js
@@ -0,0 +1,3 @@
+export default {
+  root: "src"
+};
diff --git a/examples/loader-python-to-parquet/package.json b/examples/loader-python-to-parquet/package.json
new file mode 100644
index 000000000..fecd37ee8
--- /dev/null
+++ b/examples/loader-python-to-parquet/package.json
@@ -0,0 +1,20 @@
+{
+  "type": "module",
+  "private": true,
+  "scripts": {
+    "clean": "rimraf src/.observablehq/cache",
+    "build": "rimraf dist && observable build",
+    "dev": "observable preview",
+    "deploy": "observable deploy",
+    "observable": "observable"
+  },
+  "dependencies": {
+    "@observablehq/framework": "^1.7.0"
+  },
+  "devDependencies": {
+    "rimraf": "^5.0.5"
+  },
+  "engines": {
+    "node": ">=18"
+  }
+}
diff --git a/examples/loader-python-to-parquet/requirements.txt b/examples/loader-python-to-parquet/requirements.txt
new file mode 100644
index 000000000..d95fadc55
--- /dev/null
+++ b/examples/loader-python-to-parquet/requirements.txt
@@ -0,0 +1,2 @@
+pandas==2.2.0
+pyarrow==16.1
diff --git a/examples/loader-python-to-parquet/src/.gitignore b/examples/loader-python-to-parquet/src/.gitignore
new file mode 100644
index 000000000..1235d15eb
--- /dev/null
+++ b/examples/loader-python-to-parquet/src/.gitignore
@@ -0,0 +1 @@
+/.observablehq/cache/
diff --git a/examples/loader-python-to-parquet/src/data/us-dams.parquet.py b/examples/loader-python-to-parquet/src/data/us-dams.parquet.py
new file mode 100644
index 000000000..47cc474e6
--- /dev/null
+++ b/examples/loader-python-to-parquet/src/data/us-dams.parquet.py
@@ -0,0 +1,18 @@
+# Load libraries (must be installed in the environment)
+import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet as pq
+import sys
+
+df = pd.read_csv("https://nid.sec.usace.army.mil/api/nation/csv", low_memory=False, skiprows=1).loc[:, ["Dam Name", "Primary Purpose", "Primary Dam Type", "Hazard Potential Classification"]]
+
+# Write the DataFrame, as Parquet, to an in-memory buffer
+buf = pa.BufferOutputStream()
+table = pa.Table.from_pandas(df)
+pq.write_table(table, buf, compression="snappy")
+
+# Get the buffer contents as a bytes object
+buf_bytes = buf.getvalue().to_pybytes()
+
+# Write the bytes to standard output
+sys.stdout.buffer.write(buf_bytes)
diff --git a/examples/loader-python-to-parquet/src/index.md b/examples/loader-python-to-parquet/src/index.md
new file mode 100644
index 000000000..e433bf9ec
--- /dev/null
+++ b/examples/loader-python-to-parquet/src/index.md
@@ -0,0 +1,71 @@
+# Python data loader to generate Apache Parquet
+
+Here’s a Python data loader that accesses records for over 91,000 dams from the [National Inventory of Dams](https://nid.sec.usace.army.mil/), limits the data to only four columns, then outputs an Apache Parquet file to standard output.
+
+```python
+# Load libraries (must be installed in the environment)
+import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet as pq
+import sys
+
+df = pd.read_csv("https://nid.sec.usace.army.mil/api/nation/csv", low_memory=False, skiprows=1).loc[:, ["Dam Name", "Primary Purpose", "Primary Dam Type", "Hazard Potential Classification"]]
+
+# Write the DataFrame, as Parquet, to an in-memory buffer
+buf = pa.BufferOutputStream()
+table = pa.Table.from_pandas(df)
+pq.write_table(table, buf, compression="snappy")
+
+# Get the buffer contents as a bytes object
+buf_bytes = buf.getvalue().to_pybytes()
+
+# Write the bytes to standard output
+sys.stdout.buffer.write(buf_bytes)
+```
+
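As an aside: because pandas delegates Parquet writing to pyarrow, the buffer plumbing above can be collapsed into a single `DataFrame.to_parquet` call against `sys.stdout.buffer`. This is a minimal sketch of that shortcut, not part of the patch, assuming the same source CSV and that the pyarrow engine is installed:

```python
import sys
import pandas as pd

# Select the same four columns as the loader above
df = pd.read_csv("https://nid.sec.usace.army.mil/api/nation/csv", low_memory=False, skiprows=1).loc[:, ["Dam Name", "Primary Purpose", "Primary Dam Type", "Hazard Potential Classification"]]

# to_parquet accepts a file-like object and defaults to Snappy compression
df.to_parquet(sys.stdout.buffer)
```

The explicit `BufferOutputStream` version in the example is still useful when you want direct control over pyarrow options such as the compression codec.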
+<div class="note">
+
+To run this data loader, you’ll need python3 and the `pandas` and `pyarrow` libraries installed and available on your `$PATH`.
+
+</div>
+
+<div class="tip">
+
+We recommend using a [Python virtual environment](https://observablehq.com/framework/loaders#venv), such as with venv or uv, and managing required packages via `requirements.txt` rather than installing them globally.
+
+</div>
+
+This example uses the default Snappy compression algorithm. See other [options for compression](https://parquet.apache.org/docs/file-format/data-pages/compression/) available in pyarrow’s [`write_table()`](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html) function.
+
+The above data loader lives in `data/us-dams.parquet.py`, so we can load the data using `data/us-dams.parquet`. The `FileAttachment.parquet` method parses the file and returns a promise to an Apache Arrow table.
+
+```js echo
+const dams = FileAttachment("data/us-dams.parquet").parquet();
+```
+
+We can display the table using `Inputs.table`.
+
+```js echo
+Inputs.table(dams)
+```
+
+Lastly, we can pass the table to Observable Plot to make a simple bar chart of dam counts by purpose, with color mapped to hazard classification.
+
+```js echo
+Plot.plot({
+  marginLeft: 220,
+  color: {legend: true, domain: ["Undetermined", "Low", "Significant", "High"]},
+  marks: [
+    Plot.barX(dams,
+      Plot.groupY(
+        {x: "count"},
+        {
+          y: "Primary Purpose",
+          fill: "Hazard Potential Classification",
+          sort: {y: "x", reverse: true}
+        }
+      )
+    )
+  ]
+})
+```
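A variation worth noting: if a page only ever needs these grouped counts, the aggregation could happen in the data loader itself, so the client downloads a few hundred summary rows instead of 91,000+ records. A hypothetical sketch of that alternative loader, not part of this example, assuming the same source CSV and column names:

```python
import sys
import pandas as pd

# Hypothetical variant of us-dams.parquet.py: aggregate in the loader so
# the page downloads only counts, not every dam record.
df = pd.read_csv("https://nid.sec.usace.army.mil/api/nation/csv", low_memory=False, skiprows=1)

counts = (
    df.groupby(["Primary Purpose", "Hazard Potential Classification"])
      .size()
      .reset_index(name="count")
)

# Write Parquet to standard output (Snappy-compressed by default, via pyarrow)
counts.to_parquet(sys.stdout.buffer)
```

The trade-off is flexibility: keeping the full per-dam table, as this example does, lets the same file back other views (such as the `Inputs.table` above) without re-running the loader.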