feat(api): use create_table to load example data
BREAKING CHANGE: `ibis.examples.<example-name>.fetch` no longer forwards arbitrary keyword arguments to `read_csv`/`read_parquet`.
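
A quick sketch of what this means for callers. The `table_name` and `backend` parameters come from the new `fetch` signature in this commit, the `header=1` call mirrors the keyword-forwarding doctest removed from `pivot_longer` below, and the duckdb connection plus `penguins` example follow the new tests:

    import ibis

    # Old behavior: extra keyword arguments were forwarded to
    # ibis.read_csv / ibis.read_parquet, e.g.
    #     ibis.examples.world_bank_pop_raw.fetch(header=1)  # no longer supported

    # New behavior: fetch() accepts only `table_name` and `backend`.
    con = ibis.duckdb.connect()
    t = ibis.examples.penguins.fetch(backend=con, table_name="penguins")
    assert t.get_name() in con.list_tables()
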
jcrist committed May 24, 2023
1 parent 3ea9a21 commit 42e09a4
Showing 5 changed files with 166 additions and 102 deletions.
34 changes: 34 additions & 0 deletions ibis/backends/tests/test_examples.py
@@ -0,0 +1,34 @@
import pytest

import ibis
from ibis.backends.conftest import LINUX, SANDBOXED

pytestmark = pytest.mark.examples


@pytest.mark.xfail(
LINUX and SANDBOXED,
reason="nix on linux cannot download duckdb extensions or data due to sandboxing",
raises=OSError,
)
@pytest.mark.notimpl(["dask", "datafusion", "pyspark", "sqlite"])
@pytest.mark.notyet(["bigquery", "clickhouse", "druid", "impala", "mssql", "trino"])
@pytest.mark.parametrize(
("example", "columns"),
[
(
"wowah_locations_raw",
["Map_ID", "Location_Type", "Location_Name", "Game_Version"],
),
("band_instruments", ["name", "plays"]),
(
"AwardsManagers",
["player_id", "award_id", "year_id", "lg_id", "tie", "notes"],
),
],
ids=["parquet", "csv", "csv-all-null"],
)
def test_load_examples(con, example, columns):
t = getattr(ibis.examples, example).fetch(backend=con)
assert t.columns == columns
assert t.count().execute() > 0
82 changes: 68 additions & 14 deletions ibis/examples/__init__.py
@@ -2,9 +2,10 @@

import json
import os
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Optional

import ibis
from ibis.backends.base import BaseBackend
from ibis.common.grounds import Concrete

try:
@@ -18,25 +19,79 @@
_EXAMPLES = None

_METADATA = json.loads(resources.files(__name__).joinpath("metadata.json").read_text())
_READER_FUNCS = {"csv": "read_csv", "csv.gz": "read_csv", "parquet": "read_parquet"}

# These backends load the data directly using `read_csv`/`read_parquet`. All
# other backends load the data using pyarrow, then pass it off to
# `create_table`.
_DIRECT_BACKENDS = frozenset({"duckdb", "polars"})


class Example(Concrete):
descr: Optional[str]
key: str
reader: str

def fetch(self, **kwargs: Any) -> ir.Table:
reader = getattr(ibis, self.reader)
return reader(_EXAMPLES.fetch(self.key, progressbar=True), **kwargs)

def fetch(
self,
*,
table_name: str | None = None,
backend: BaseBackend | None = None,
) -> ir.Table:
path = _EXAMPLES.fetch(self.key, progressbar=True)

if backend is None:
backend = ibis.get_backend()

if table_name is None:
table_name = ibis.util.gen_name(f"examples_{type(self).__name__}")

if backend.name in _DIRECT_BACKENDS:
# Read directly into these backends. This helps reduce memory
# usage, making the larger example datasets easier to work with.
if path.endswith(".parquet"):
return backend.read_parquet(path, table_name=table_name)
else:
return backend.read_csv(path, table_name=table_name)
else:
if path.endswith(".parquet"):
import pyarrow.parquet

table = pyarrow.parquet.read_table(path)
else:
import pyarrow.csv

# The convert options let pyarrow treat empty strings as null for
# string columns, but not quoted empty strings.
table = pyarrow.csv.read_csv(
path,
convert_options=pyarrow.csv.ConvertOptions(
strings_can_be_null=True,
quoted_strings_can_be_null=False,
),
)

# All null columns are inferred as null-type, but not all
# backends support null-type columns. Cast to an all-null
# string column instead.
for i, field in enumerate(table.schema):
if pyarrow.types.is_null(field.type):
table = table.set_column(i, field.name, table[i].cast("string"))

# TODO: It should be possible to avoid this memtable call, once all
# backends support passing a `pyarrow.Table` to `create_table`
# directly.
obj = ibis.memtable(table)
return backend.create_table(table_name, obj, temp=True, overwrite=True)


_FETCH_DOCSTRING_TEMPLATE = """\
Fetch the {name} example.
def _make_fetch_docstring(*, name: str, reader: str):
return f"""Fetch the {name} example.
Parameters
----------
kwargs
Same as the arguments for [`ibis.{reader}`][ibis.{reader}]
table_name
The table name to use, defaults to a generated table name.
backend
The backend to load the example into. Defaults to the default backend.
Returns
-------
@@ -78,12 +133,11 @@ def __getattr__(name: str) -> Example:
description = spec.get("description")

_, ext = key.split(os.extsep, maxsplit=1)
reader = _READER_FUNCS[ext]

fields = {"__doc__": description} if description is not None else {}

example_class = type(name, (Example,), fields)
example_class.fetch.__doc__ = _make_fetch_docstring(name=name, reader=reader)
example = example_class(descr=description, key=key, reader=reader)
example_class.fetch.__doc__ = _FETCH_DOCSTRING_TEMPLATE.format(name=name)
example = example_class(descr=description, key=key)
setattr(ibis.examples, name, example)
return example
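
For context on the pyarrow branch of `fetch` above, here is a minimal standalone sketch (not part of the commit; the toy CSV and column names are illustrative) of the behavior it works around: with `strings_can_be_null=True`, a CSV column containing only empty fields is inferred as null-typed, and the cast replaces it with an all-null string column so backends without a null type can ingest it.

    import pyarrow as pa
    import pyarrow.csv

    # A column whose fields are all empty is inferred as null-typed when
    # empty strings are treated as null.
    data = b"player_id,notes\np1,\np2,\n"
    table = pyarrow.csv.read_csv(
        pa.BufferReader(data),
        convert_options=pyarrow.csv.ConvertOptions(
            strings_can_be_null=True,
            quoted_strings_can_be_null=False,
        ),
    )
    assert pa.types.is_null(table.schema.field("notes").type)

    # Mirror the commit's workaround: replace null-typed columns with
    # all-null string columns.
    for i, field in enumerate(table.schema):
        if pa.types.is_null(field.type):
            table = table.set_column(i, field.name, table[i].cast("string"))
    assert table.schema.field("notes").type == pa.string()
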
58 changes: 42 additions & 16 deletions ibis/examples/tests/test_examples.py
@@ -1,4 +1,5 @@
import os
import uuid

import pytest

@@ -26,15 +27,17 @@
* (os.environ.get("CI") is None)
)


@pytest.mark.parametrize("example", sorted(frozenset(dir(ibis.examples)) - ignored))
@pytest.mark.duckdb
@pytest.mark.backend
@pytest.mark.xfail(
xfail_linux_nix = pytest.mark.xfail(
LINUX and SANDBOXED,
reason="nix on linux cannot download duckdb extensions or data due to sandboxing",
raises=OSError,
)


@pytest.mark.parametrize("example", sorted(frozenset(dir(ibis.examples)) - ignored))
@pytest.mark.duckdb
@pytest.mark.backend
@xfail_linux_nix
def test_examples(example, tmp_path):
ex = getattr(ibis.examples, example)

Expand All @@ -56,19 +59,42 @@ def test_non_example():

@pytest.mark.duckdb
@pytest.mark.backend
@pytest.mark.xfail(
LINUX and SANDBOXED,
reason="nix on linux cannot download duckdb extensions or data due to sandboxing",
raises=OSError,
)
@xfail_linux_nix
def test_backend_arg():
con = ibis.duckdb.connect()
t = ibis.examples.penguins.fetch(backend=con)
assert t.get_name() in con.list_tables()


@pytest.mark.duckdb
@pytest.mark.backend
@xfail_linux_nix
def test_table_name_arg():
con = ibis.duckdb.connect()
name = f"penguins-{uuid.uuid4().hex}"
t = ibis.examples.penguins.fetch(backend=con, table_name=name)
assert t.get_name() == name


@pytest.mark.pandas
@pytest.mark.duckdb
@pytest.mark.backend
@xfail_linux_nix
@pytest.mark.parametrize(
("example", "expected"),
("example", "columns"),
[
("band_members", ["name", "band"]),
("ml_latest_small_links", ["movieId", "imdbId", "tmdbId"]),
("band_instruments", ["name", "plays"]),
("band_instruments2", ["artist", "plays"]),
(
"AwardsManagers",
["player_id", "award_id", "year_id", "lg_id", "tie", "notes"],
),
],
ids=["members", "instruments", "instruments2"],
ids=["parquet", "csv", "csv-all-null"],
)
def test_band(example, expected):
assert getattr(ibis.examples, example).fetch().columns == expected
@pytest.mark.parametrize("backend_name", ["duckdb", "polars", "pandas"])
def test_load_example(backend_name, example, columns):
pytest.importorskip(backend_name)
con = getattr(ibis, backend_name).connect()
t = getattr(ibis.examples, example).fetch(backend=con)
assert t.columns == columns
42 changes: 2 additions & 40 deletions ibis/expr/api.py
@@ -847,26 +847,7 @@ def read_csv(sources: str | Path | Sequence[str | Path], **kwargs: Any) -> ir.Ta
Examples
--------
>>> import ibis
>>> ibis.options.interactive = True
>>> t = ibis.examples.Batting_raw.fetch()
>>> t
┏━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━┓
┃ playerID ┃ yearID ┃ stint ┃ teamID ┃ lgID ┃ G ┃ AB ┃ R ┃ … ┃
┡━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━┩
│ string │ int64 │ int64 │ string │ string │ int64 │ int64 │ int64 │ … │
├───────────┼────────┼───────┼────────┼────────┼───────┼───────┼───────┼───┤
│ abercda01 │ 1871 │ 1 │ TRO │ NA │ 1 │ 4 │ 0 │ … │
│ addybo01 │ 1871 │ 1 │ RC1 │ NA │ 25 │ 118 │ 30 │ … │
│ allisar01 │ 1871 │ 1 │ CL1 │ NA │ 29 │ 137 │ 28 │ … │
│ allisdo01 │ 1871 │ 1 │ WS3 │ NA │ 27 │ 133 │ 28 │ … │
│ ansonca01 │ 1871 │ 1 │ RC1 │ NA │ 25 │ 120 │ 29 │ … │
│ armstbo01 │ 1871 │ 1 │ FW1 │ NA │ 12 │ 49 │ 9 │ … │
│ barkeal01 │ 1871 │ 1 │ RC1 │ NA │ 1 │ 4 │ 0 │ … │
│ barnero01 │ 1871 │ 1 │ BS1 │ NA │ 31 │ 157 │ 66 │ … │
│ barrebi01 │ 1871 │ 1 │ FW1 │ NA │ 1 │ 5 │ 1 │ … │
│ barrofr01 │ 1871 │ 1 │ BS1 │ NA │ 18 │ 86 │ 13 │ … │
│ … │ … │ … │ … │ … │ … │ … │ … │ … │
└───────────┴────────┴───────┴────────┴────────┴───────┴───────┴───────┴───┘
>>> t = ibis.read_csv("path/to/data.csv") # doctest: +SKIP
"""
from ibis.config import _default_backend

@@ -947,26 +928,7 @@ def read_parquet(sources: str | Path | Sequence[str | Path], **kwargs: Any) -> i
Examples
--------
>>> import ibis
>>> ibis.options.interactive = True
>>> t = ibis.examples.Batting_raw.fetch()
>>> t
┏━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━┓
┃ playerID ┃ yearID ┃ stint ┃ teamID ┃ lgID ┃ G ┃ AB ┃ R ┃ … ┃
┡━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━┩
│ string │ int64 │ int64 │ string │ string │ int64 │ int64 │ int64 │ … │
├───────────┼────────┼───────┼────────┼────────┼───────┼───────┼───────┼───┤
│ abercda01 │ 1871 │ 1 │ TRO │ NA │ 1 │ 4 │ 0 │ … │
│ addybo01 │ 1871 │ 1 │ RC1 │ NA │ 25 │ 118 │ 30 │ … │
│ allisar01 │ 1871 │ 1 │ CL1 │ NA │ 29 │ 137 │ 28 │ … │
│ allisdo01 │ 1871 │ 1 │ WS3 │ NA │ 27 │ 133 │ 28 │ … │
│ ansonca01 │ 1871 │ 1 │ RC1 │ NA │ 25 │ 120 │ 29 │ … │
│ armstbo01 │ 1871 │ 1 │ FW1 │ NA │ 12 │ 49 │ 9 │ … │
│ barkeal01 │ 1871 │ 1 │ RC1 │ NA │ 1 │ 4 │ 0 │ … │
│ barnero01 │ 1871 │ 1 │ BS1 │ NA │ 31 │ 157 │ 66 │ … │
│ barrebi01 │ 1871 │ 1 │ FW1 │ NA │ 1 │ 5 │ 1 │ … │
│ barrofr01 │ 1871 │ 1 │ BS1 │ NA │ 18 │ 86 │ 13 │ … │
│ … │ … │ … │ … │ … │ … │ … │ … │ … │
└───────────┴────────┴───────┴────────┴────────┴───────┴───────┴───────┴───┘
>>> t = ibis.read_parquet("path/to/data.parquet") # doctest: +SKIP
"""
from ibis.config import _default_backend

52 changes: 20 additions & 32 deletions ibis/expr/types/relations.py
@@ -598,22 +598,16 @@ def columns(self) -> list[str]:
--------
>>> import ibis
>>> ibis.options.interactive = True
>>> t = ibis.examples.starwars.fetch()
>>> t = ibis.examples.penguins.fetch()
>>> t.columns
['name',
'height',
'mass',
'hair_color',
'skin_color',
'eye_color',
'birth_year',
['species',
'island',
'bill_length_mm',
'bill_depth_mm',
'flipper_length_mm',
'body_mass_g',
'sex',
'gender',
'homeworld',
'species',
'films',
'vehicles',
'starships']
'year']
"""
return list(self.schema().names)

@@ -629,23 +623,17 @@ def schema(self) -> sch.Schema:
--------
>>> import ibis
>>> ibis.options.interactive = True
>>> t = ibis.examples.starwars.fetch()
>>> t = ibis.examples.penguins.fetch()
>>> t.schema()
ibis.Schema {
name string
height int64
mass float64
hair_color string
skin_color string
eye_color string
birth_year float64
sex string
gender string
homeworld string
species string
films string
vehicles string
starships string
species string
island string
bill_length_mm float64
bill_depth_mm float64
flipper_length_mm int64
body_mass_g int64
sex string
year int64
}
"""
return self.op().schema
@@ -2298,7 +2286,7 @@ def info(self) -> Table:
--------
>>> import ibis
>>> ibis.options.interactive = True
>>> t = ibis.examples.penguins.fetch(table_name="penguins")
>>> t = ibis.examples.penguins.fetch()
>>> t.info()
┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━┓
┃ name ┃ type ┃ nullable ┃ nulls ┃ non_nulls ┃ null_frac ┃ … ┃
@@ -2752,7 +2740,7 @@ def cache(self) -> Table:
--------
>>> import ibis
>>> ibis.options.interactive = True
>>> t = ibis.examples.penguins.fetch(table_name="penguins")
>>> t = ibis.examples.penguins.fetch()
>>> cached_penguins = t.mutate(computation="Heavy Computation").cache()
>>> cached_penguins
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━┓
@@ -2889,7 +2877,7 @@ def pivot_longer(
Similarly for a different example dataset, we convert names to values
but using a different selector and the default `values_to` value.
>>> world_bank_pop = ibis.examples.world_bank_pop_raw.fetch(header=1)
>>> world_bank_pop = ibis.examples.world_bank_pop_raw.fetch()
>>> world_bank_pop.head()
┏━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━┓
┃ country ┃ indicator ┃ 2000 ┃ 2001 ┃ 2002 ┃ … ┃
