refactor(examples): replace pooch with lighter weight pins

ibis-project · Sep 6, 2023 · 521669c
1 parent 8dabefd
commit 521669c
Show file tree

Hide file tree

Showing 13 changed files with 1,193 additions and 1,510 deletions.
diff --git a/ibis/backends/tests/test_examples.py b/ibis/backends/tests/test_examples.py
@@ -3,15 +3,14 @@
 import pytest
 
 import ibis
-from ibis.conftest import LINUX, SANDBOXED
+from ibis.conftest import LINUX, MACOS, SANDBOXED
 
 pytestmark = pytest.mark.examples
 
 
-@pytest.mark.xfail(
-    LINUX and SANDBOXED,
+@pytest.mark.skipif(
+    (LINUX or MACOS) and SANDBOXED,
     reason="nix on linux cannot download duckdb extensions or data due to sandboxing",
-    raises=OSError,
 )
 @pytest.mark.notimpl(["dask", "datafusion", "pyspark"])
 @pytest.mark.notyet(["bigquery", "clickhouse", "druid", "impala", "mssql", "trino"])

diff --git a/ibis/examples/__init__.py b/ibis/examples/__init__.py
@@ -1,11 +1,9 @@
 from __future__ import annotations
 
+import functools
 import json
-import os
 from typing import TYPE_CHECKING, Optional
 
-import filelock
-
 import ibis
 from ibis.common.grounds import Concrete
 
@@ -18,38 +16,33 @@
     import ibis.expr.types as ir
     from ibis.backends.base import BaseBackend
 
-_EXAMPLES = None
-
-_METADATA = json.loads(resources.files(__name__).joinpath("metadata.json").read_text())
-
 # These backends load the data directly using `read_csv`/`read_parquet`. All
 # other backends load the data using pyarrow, then passing it off to
 # `create_table`.
 _DIRECT_BACKENDS = frozenset({"duckdb", "polars"})
 
 
 class Example(Concrete):
-    descr: Optional[str]
-    key: str
+    name: str
+    help: Optional[str]
 
     def fetch(
         self,
         *,
         table_name: str | None = None,
         backend: BaseBackend | None = None,
     ) -> ir.Table:
-        key = self.key
-        # lock to ensure we don't clobber the file if fetched in another
-        # process
-        _EXAMPLES.abspath.mkdir(parents=True, exist_ok=True)
-        with filelock.FileLock(_EXAMPLES.abspath / f"{key}.lock"):
-            path = _EXAMPLES.fetch(key, progressbar=True)
-
         if backend is None:
             backend = ibis.get_backend()
 
+        name = self.name
+
         if table_name is None:
-            table_name = ibis.util.gen_name(f"examples_{type(self).__name__}")
+            table_name = ibis.util.gen_name(f"examples_{name}")
+
+        board = _get_board()
+
+        (path,) = board.pin_download(name)
 
         if backend.name in _DIRECT_BACKENDS:
             # Read directly into these backends. This helps reduce memory
@@ -111,40 +104,40 @@ def fetch(
 >>> t = ibis.examples.{name}.fetch()
 """
 
+_BUCKET = "ibis-pins"
 
-def __dir__() -> list[str]:
-    return sorted(_METADATA.keys())
 
+@functools.cache
+def _get_metadata():
+    return json.loads(resources.files(__name__).joinpath("metadata.json").read_text())
 
-def __getattr__(name: str) -> Example:
-    global _EXAMPLES  # noqa: PLW0603
 
-    if _EXAMPLES is None:
-        import pooch
+@functools.cache
+def _get_board():
+    import pins
+
+    return pins.board_gcs(_BUCKET)
 
-        _EXAMPLES = pooch.create(
-            path=pooch.os_cache("ibis-framework"),
-            # the trailing slash matters here
-            base_url="https://storage.googleapis.com/ibis-examples/data/",
-            version=ibis.__version__,
-            env="IBIS_EXAMPLES_DATA",
-        )
-        with resources.files(__name__).joinpath("registry.txt").open(mode="r") as _f:
-            _EXAMPLES.load_registry(_f)
 
-    spec = _METADATA.get(name, {})
+@functools.cache
+def __dir__() -> list[str]:
+    return sorted(_get_metadata().keys())
+
 
-    if (key := spec.get("key")) is None:
-        raise AttributeError(name)
+def __getattr__(name: str) -> Example:
+    try:
+        meta = _get_metadata()
 
-    description = spec.get("description")
+        description = meta[name].get("description")
 
-    _, ext = key.split(os.extsep, maxsplit=1)
+        fields = {"__doc__": description} if description is not None else {}
 
-    fields = {"__doc__": description} if description is not None else {}
+        example_class = type(name, (Example,), fields)
+        example_class.fetch.__doc__ = _FETCH_DOCSTRING_TEMPLATE.format(name=name)
 
-    example_class = type(name, (Example,), fields)
-    example_class.fetch.__doc__ = _FETCH_DOCSTRING_TEMPLATE.format(name=name)
-    example = example_class(descr=description, key=key)
-    setattr(ibis.examples, name, example)
-    return example
+        example = example_class(name=name, help=description)
+        setattr(ibis.examples, name, example)
+    except Exception as e:  # noqa: BLE001
+        raise AttributeError(name) from e
+    else:
+        return example
diff --git a/ibis/examples/gen_examples.R b/ibis/examples/gen_examples.R
@@ -3,18 +3,26 @@ library(stringr)
 library(palmerpenguins)
 library(janitor)
 
-lookup <- list("penguins_raw (penguins)" = "penguins_raw")
-ignored <- c("sim1", "sim2", "sim3", "sim4", "table1", "table2", "table3", "table4a", "table4b", "table5", "Animals", "Oats", "Muscle", "Melanoma")
+LOOKUP <- list("penguins_raw (penguins)" = "penguins_raw")
+IGNORED <- c("sim1", "sim2", "sim3", "sim4", "table1", "table2", "table3", "table4a", "table4b", "table5", "Animals", "Oats", "Muscle", "Melanoma")
+DESCRIPTIONS_PATH <- "ibis/examples/descriptions"
+DATA_PATH <- "ibis/examples/data"
 
-results <- as.data.frame(data(package = .packages(all.available = TRUE))$results)
-for (i in 1:nrow(results)) {
-    row <- results[i,]
+RESULTS <- as.data.frame(data(package = .packages(all.available = TRUE))$results)
+
+write_description <- function (name, description) {
+    cat(description, file = paste(DESCRIPTIONS_PATH, name, sep = "/"))
+}
+
+for (i in 1:nrow(RESULTS)) {
+    row <- RESULTS[i,]
     package <- row$Package
 
     library(package, warn.conflicts = FALSE, character.only = TRUE)
 
     item <- row$Item
-    name <- lookup[[item]]
+    name <- LOOKUP[[item]]
+
     if (is.null(name)) {
         name <- item
     }
@@ -23,21 +31,23 @@ for (i in 1:nrow(results)) {
 
     name <- str_replace_all(name, "\\.", "_")
 
-    if (!(name %in% ignored) && !is.null(data) && is.data.frame(data)) {
+    if (!(name %in% IGNORED) && !is.null(data) && is.data.frame(data)) {
         basename <- paste(name, "csv.gz", sep = ".")
-        file <- paste("ibis/examples/data", basename, sep = "/")
+        file <- paste(DATA_PATH, basename, sep = "/")
 
         clean_data <- clean_names(data)
         write_csv(clean_data, file = file, quote = "needed", na = "")
 
-        # write a column-name-uncleansed file if the clean names differ
+        description <- row$Title
+        write_description(name, description)
+
+        # write a raw-column-name file if the clean names differ
         if (any(names(clean_data) != names(data))) {
-            raw_basename <- paste(paste(name, "raw", sep = "_"), "csv.gz", sep = ".")
-            raw_file <- paste("ibis/examples/data", raw_basename, sep = "/")
+            raw_name <- paste(name, "raw", sep = "_")
+            raw_file <- paste(DATA_PATH, paste(raw_name, "csv.gz", sep = "."), sep = "/")
+
             write_csv(data, file = raw_file, quote = "needed", na = "")
+            write_description(raw_name, description)
         }
-
-        text <- row$Title
-        cat(text, file = paste("ibis/examples/descriptions", name, sep = "/"))
     }
 }