Skip to content

Commit

Permalink
refactor(examples): replace pooch with lighter weight pins
Browse files Browse the repository at this point in the history
  • Loading branch information
cpcloud committed Sep 6, 2023
1 parent 8dabefd commit 521669c
Show file tree
Hide file tree
Showing 13 changed files with 1,193 additions and 1,510 deletions.
7 changes: 3 additions & 4 deletions ibis/backends/tests/test_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,14 @@
import pytest

import ibis
from ibis.conftest import LINUX, SANDBOXED
from ibis.conftest import LINUX, MACOS, SANDBOXED

pytestmark = pytest.mark.examples


@pytest.mark.xfail(
LINUX and SANDBOXED,
@pytest.mark.skipif(
(LINUX or MACOS) and SANDBOXED,
reason="nix on linux cannot download duckdb extensions or data due to sandboxing",
raises=OSError,
)
@pytest.mark.notimpl(["dask", "datafusion", "pyspark"])
@pytest.mark.notyet(["bigquery", "clickhouse", "druid", "impala", "mssql", "trino"])
Expand Down
79 changes: 36 additions & 43 deletions ibis/examples/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
from __future__ import annotations

import functools
import json
import os
from typing import TYPE_CHECKING, Optional

import filelock

import ibis
from ibis.common.grounds import Concrete

Expand All @@ -18,38 +16,33 @@
import ibis.expr.types as ir
from ibis.backends.base import BaseBackend

_EXAMPLES = None

_METADATA = json.loads(resources.files(__name__).joinpath("metadata.json").read_text())

# These backends load the data directly using `read_csv`/`read_parquet`. All
# other backends load the data using pyarrow, then pass it off to
# `create_table`.
_DIRECT_BACKENDS = frozenset({"duckdb", "polars"})


class Example(Concrete):
descr: Optional[str]
key: str
name: str
help: Optional[str]

def fetch(
self,
*,
table_name: str | None = None,
backend: BaseBackend | None = None,
) -> ir.Table:
key = self.key
# lock to ensure we don't clobber the file if fetched in another
# process
_EXAMPLES.abspath.mkdir(parents=True, exist_ok=True)
with filelock.FileLock(_EXAMPLES.abspath / f"{key}.lock"):
path = _EXAMPLES.fetch(key, progressbar=True)

if backend is None:
backend = ibis.get_backend()

name = self.name

if table_name is None:
table_name = ibis.util.gen_name(f"examples_{type(self).__name__}")
table_name = ibis.util.gen_name(f"examples_{name}")

board = _get_board()

(path,) = board.pin_download(name)

if backend.name in _DIRECT_BACKENDS:
# Read directly into these backends. This helps reduce memory
Expand Down Expand Up @@ -111,40 +104,40 @@ def fetch(
>>> t = ibis.examples.{name}.fetch()
"""

_BUCKET = "ibis-pins"

def __dir__() -> list[str]:
return sorted(_METADATA.keys())

@functools.cache
def _get_metadata():
return json.loads(resources.files(__name__).joinpath("metadata.json").read_text())

def __getattr__(name: str) -> Example:
global _EXAMPLES # noqa: PLW0603

if _EXAMPLES is None:
import pooch
@functools.cache
def _get_board():
import pins

return pins.board_gcs(_BUCKET)

_EXAMPLES = pooch.create(
path=pooch.os_cache("ibis-framework"),
# the trailing slash matters here
base_url="https://storage.googleapis.com/ibis-examples/data/",
version=ibis.__version__,
env="IBIS_EXAMPLES_DATA",
)
with resources.files(__name__).joinpath("registry.txt").open(mode="r") as _f:
_EXAMPLES.load_registry(_f)

spec = _METADATA.get(name, {})
@functools.cache
def __dir__() -> list[str]:
return sorted(_get_metadata().keys())


if (key := spec.get("key")) is None:
raise AttributeError(name)
def __getattr__(name: str) -> Example:
try:
meta = _get_metadata()

description = spec.get("description")
description = meta[name].get("description")

_, ext = key.split(os.extsep, maxsplit=1)
fields = {"__doc__": description} if description is not None else {}

fields = {"__doc__": description} if description is not None else {}
example_class = type(name, (Example,), fields)
example_class.fetch.__doc__ = _FETCH_DOCSTRING_TEMPLATE.format(name=name)

example_class = type(name, (Example,), fields)
example_class.fetch.__doc__ = _FETCH_DOCSTRING_TEMPLATE.format(name=name)
example = example_class(descr=description, key=key)
setattr(ibis.examples, name, example)
return example
example = example_class(name=name, help=description)
setattr(ibis.examples, name, example)
except Exception as e: # noqa: BLE001
raise AttributeError(name) from e
else:
return example
38 changes: 24 additions & 14 deletions ibis/examples/gen_examples.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,26 @@ library(stringr)
library(palmerpenguins)
library(janitor)

lookup <- list("penguins_raw (penguins)" = "penguins_raw")
ignored <- c("sim1", "sim2", "sim3", "sim4", "table1", "table2", "table3", "table4a", "table4b", "table5", "Animals", "Oats", "Muscle", "Melanoma")
LOOKUP <- list("penguins_raw (penguins)" = "penguins_raw")
IGNORED <- c("sim1", "sim2", "sim3", "sim4", "table1", "table2", "table3", "table4a", "table4b", "table5", "Animals", "Oats", "Muscle", "Melanoma")
DESCRIPTIONS_PATH <- "ibis/examples/descriptions"
DATA_PATH <- "ibis/examples/data"

results <- as.data.frame(data(package = .packages(all.available = TRUE))$results)
for (i in 1:nrow(results)) {
row <- results[i,]
RESULTS <- as.data.frame(data(package = .packages(all.available = TRUE))$results)

write_description <- function (name, description) {
cat(description, file = paste(DESCRIPTIONS_PATH, name, sep = "/"))
}

for (i in 1:nrow(RESULTS)) {
row <- RESULTS[i,]
package <- row$Package

library(package, warn.conflicts = FALSE, character.only = TRUE)

item <- row$Item
name <- lookup[[item]]
name <- LOOKUP[[item]]

if (is.null(name)) {
name <- item
}
Expand All @@ -23,21 +31,23 @@ for (i in 1:nrow(results)) {

name <- str_replace_all(name, "\\.", "_")

if (!(name %in% ignored) && !is.null(data) && is.data.frame(data)) {
if (!(name %in% IGNORED) && !is.null(data) && is.data.frame(data)) {
basename <- paste(name, "csv.gz", sep = ".")
file <- paste("ibis/examples/data", basename, sep = "/")
file <- paste(DATA_PATH, basename, sep = "/")

clean_data <- clean_names(data)
write_csv(clean_data, file = file, quote = "needed", na = "")

# write a column-name-uncleansed file if the clean names differ
description <- row$Title
write_description(name, description)

# write a raw-column-name file if the clean names differ
if (any(names(clean_data) != names(data))) {
raw_basename <- paste(paste(name, "raw", sep = "_"), "csv.gz", sep = ".")
raw_file <- paste("ibis/examples/data", raw_basename, sep = "/")
raw_name <- paste(name, "raw", sep = "_")
raw_file <- paste(DATA_PATH, paste(raw_name, "csv.gz", sep = "."), sep = "/")

write_csv(data, file = raw_file, quote = "needed", na = "")
write_description(raw_name, description)
}

text <- row$Title
cat(text, file = paste("ibis/examples/descriptions", name, sep = "/"))
}
}
Loading

0 comments on commit 521669c

Please sign in to comment.