feat(api): use create_table to load example data
BREAKING CHANGE: `ibis.examples.<example-name>.fetch` no longer forwards arbitrary keyword arguments to `read_csv`/`read_parquet`.
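
A quick sketch of what this means for callers. The `table_name` and `backend` parameters come from the new `fetch` signature in this commit, the `header=1` call mirrors the keyword-forwarding doctest removed from `pivot_longer` below, and the duckdb connection plus `penguins` example follow the new tests:

    import ibis

    # Old behavior: extra keyword arguments were forwarded to
    # ibis.read_csv / ibis.read_parquet, e.g.
    #     ibis.examples.world_bank_pop_raw.fetch(header=1)  # no longer supported

    # New behavior: fetch() accepts only `table_name` and `backend`.
    con = ibis.duckdb.connect()
    t = ibis.examples.penguins.fetch(backend=con, table_name="penguins")
    assert t.get_name() in con.list_tables()
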
jcrist committed May 24, 2023
1 parent 3ea9a21 commit 42e09a4
Showing 5 changed files with 166 additions and 102 deletions.
34 changes: 34 additions & 0 deletions ibis/backends/tests/test_examples.py
@@ -0,0 +1,34 @@
import pytest

import ibis
from ibis.backends.conftest import LINUX, SANDBOXED

pytestmark = pytest.mark.examples


@pytest.mark.xfail(
LINUX and SANDBOXED,
reason="nix on linux cannot download duckdb extensions or data due to sandboxing",
raises=OSError,
)
@pytest.mark.notimpl(["dask", "datafusion", "pyspark", "sqlite"])
@pytest.mark.notyet(["bigquery", "clickhouse", "druid", "impala", "mssql", "trino"])
@pytest.mark.parametrize(
("example", "columns"),
[
(
"wowah_locations_raw",
["Map_ID", "Location_Type", "Location_Name", "Game_Version"],
),
("band_instruments", ["name", "plays"]),
(
"AwardsManagers",
["player_id", "award_id", "year_id", "lg_id", "tie", "notes"],
),
],
ids=["parquet", "csv", "csv-all-null"],
)
def test_load_examples(con, example, columns):
t = getattr(ibis.examples, example).fetch(backend=con)
assert t.columns == columns
assert t.count().execute() > 0
82 changes: 68 additions & 14 deletions ibis/examples/__init__.py
@@ -2,9 +2,10 @@

import json
import os
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Optional

import ibis
from ibis.backends.base import BaseBackend
from ibis.common.grounds import Concrete

try:
@@ -18,25 +19,79 @@
_EXAMPLES = None

_METADATA = json.loads(resources.files(__name__).joinpath("metadata.json").read_text())
_READER_FUNCS = {"csv": "read_csv", "csv.gz": "read_csv", "parquet": "read_parquet"}

# These backends load the data directly using `read_csv`/`read_parquet`. All
# other backends load the data using pyarrow, then pass it off to
# `create_table`.
_DIRECT_BACKENDS = frozenset({"duckdb", "polars"})


class Example(Concrete):
descr: Optional[str]
key: str
reader: str

def fetch(self, **kwargs: Any) -> ir.Table:
reader = getattr(ibis, self.reader)
return reader(_EXAMPLES.fetch(self.key, progressbar=True), **kwargs)

def fetch(
self,
*,
table_name: str | None = None,
backend: BaseBackend | None = None,
) -> ir.Table:
path = _EXAMPLES.fetch(self.key, progressbar=True)

if backend is None:
backend = ibis.get_backend()

if table_name is None:
table_name = ibis.util.gen_name(f"examples_{type(self).__name__}")

if backend.name in _DIRECT_BACKENDS:
# Read directly into these backends. This helps reduce memory
# usage, making the larger example datasets easier to work with.
if path.endswith(".parquet"):
return backend.read_parquet(path, table_name=table_name)
else:
return backend.read_csv(path, table_name=table_name)
else:
if path.endswith(".parquet"):
import pyarrow.parquet

table = pyarrow.parquet.read_table(path)
else:
import pyarrow.csv

# The convert options let pyarrow treat empty strings as null for
# string columns, but not quoted empty strings.
table = pyarrow.csv.read_csv(
path,
convert_options=pyarrow.csv.ConvertOptions(
strings_can_be_null=True,
quoted_strings_can_be_null=False,
),
)

# All null columns are inferred as null-type, but not all
# backends support null-type columns. Cast to an all-null
# string column instead.
for i, field in enumerate(table.schema):
if pyarrow.types.is_null(field.type):
table = table.set_column(i, field.name, table[i].cast("string"))

# TODO: It should be possible to avoid this memtable call, once all
# backends support passing a `pyarrow.Table` to `create_table`
# directly.
obj = ibis.memtable(table)
return backend.create_table(table_name, obj, temp=True, overwrite=True)


_FETCH_DOCSTRING_TEMPLATE = """\
Fetch the {name} example.
def _make_fetch_docstring(*, name: str, reader: str):
return f"""Fetch the {name} example.
Parameters
----------
kwargs
Same as the arguments for [`ibis.{reader}`][ibis.{reader}]
table_name
The table name to use, defaults to a generated table name.
backend
The backend to load the example into. Defaults to the default backend.
Returns
-------
@@ -78,12 +133,11 @@ def __getattr__(name: str) -> Example:
description = spec.get("description")

_, ext = key.split(os.extsep, maxsplit=1)
reader = _READER_FUNCS[ext]

fields = {"__doc__": description} if description is not None else {}

example_class = type(name, (Example,), fields)
example_class.fetch.__doc__ = _make_fetch_docstring(name=name, reader=reader)
example = example_class(descr=description, key=key, reader=reader)
example_class.fetch.__doc__ = _FETCH_DOCSTRING_TEMPLATE.format(name=name)
example = example_class(descr=description, key=key)
setattr(ibis.examples, name, example)
return example
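
For context on the pyarrow branch of `fetch` above, here is a minimal standalone sketch (not part of the commit; the toy CSV and column names are illustrative) of the behavior it works around: with `strings_can_be_null=True`, a CSV column containing only empty fields is inferred as null-typed, and the cast replaces it with an all-null string column so backends without a null type can ingest it.

    import pyarrow as pa
    import pyarrow.csv

    # A column whose fields are all empty is inferred as null-typed when
    # empty strings are treated as null.
    data = b"player_id,notes\np1,\np2,\n"
    table = pyarrow.csv.read_csv(
        pa.BufferReader(data),
        convert_options=pyarrow.csv.ConvertOptions(
            strings_can_be_null=True,
            quoted_strings_can_be_null=False,
        ),
    )
    assert pa.types.is_null(table.schema.field("notes").type)

    # Mirror the commit's workaround: replace null-typed columns with
    # all-null string columns.
    for i, field in enumerate(table.schema):
        if pa.types.is_null(field.type):
            table = table.set_column(i, field.name, table[i].cast("string"))
    assert table.schema.field("notes").type == pa.string()
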
58 changes: 42 additions & 16 deletions ibis/examples/tests/test_examples.py
@@ -1,4 +1,5 @@
import os
import uuid

import pytest

@@ -26,15 +27,17 @@
* (os.environ.get("CI") is None)
)


@pytest.mark.parametrize("example", sorted(frozenset(dir(ibis.examples)) - ignored))
@pytest.mark.duckdb
@pytest.mark.backend
@pytest.mark.xfail(
xfail_linux_nix = pytest.mark.xfail(
LINUX and SANDBOXED,
reason="nix on linux cannot download duckdb extensions or data due to sandboxing",
raises=OSError,
)


@pytest.mark.parametrize("example", sorted(frozenset(dir(ibis.examples)) - ignored))
@pytest.mark.duckdb
@pytest.mark.backend
@xfail_linux_nix
def test_examples(example, tmp_path):
ex = getattr(ibis.examples, example)

Expand All @@ -56,19 +59,42 @@ def test_non_example():

@pytest.mark.duckdb
@pytest.mark.backend
@pytest.mark.xfail(
LINUX and SANDBOXED,
reason="nix on linux cannot download duckdb extensions or data due to sandboxing",
raises=OSError,
)
@xfail_linux_nix
def test_backend_arg():
con = ibis.duckdb.connect()
t = ibis.examples.penguins.fetch(backend=con)
assert t.get_name() in con.list_tables()


@pytest.mark.duckdb
@pytest.mark.backend
@xfail_linux_nix
def test_table_name_arg():
con = ibis.duckdb.connect()
name = f"penguins-{uuid.uuid4().hex}"
t = ibis.examples.penguins.fetch(backend=con, table_name=name)
assert t.get_name() == name


@pytest.mark.pandas
@pytest.mark.duckdb
@pytest.mark.backend
@xfail_linux_nix
@pytest.mark.parametrize(
("example", "expected"),
("example", "columns"),
[
("band_members", ["name", "band"]),
("ml_latest_small_links", ["movieId", "imdbId", "tmdbId"]),
("band_instruments", ["name", "plays"]),
("band_instruments2", ["artist", "plays"]),
(
"AwardsManagers",
["player_id", "award_id", "year_id", "lg_id", "tie", "notes"],
),
],
ids=["members", "instruments", "instruments2"],
ids=["parquet", "csv", "csv-all-null"],
)
def test_band(example, expected):
assert getattr(ibis.examples, example).fetch().columns == expected
@pytest.mark.parametrize("backend_name", ["duckdb", "polars", "pandas"])
def test_load_example(backend_name, example, columns):
pytest.importorskip(backend_name)
con = getattr(ibis, backend_name).connect()
t = getattr(ibis.examples, example).fetch(backend=con)
assert t.columns == columns
42 changes: 2 additions & 40 deletions ibis/expr/api.py
@@ -847,26 +847,7 @@ def read_csv(sources: str | Path | Sequence[str | Path], **kwargs: Any) -> ir.Ta
Examples
--------
>>> import ibis
>>> ibis.options.interactive = True
>>> t = ibis.examples.Batting_raw.fetch()
>>> t
┏━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━┓
┃ playerID ┃ yearID ┃ stint ┃ teamID ┃ lgID ┃ G ┃ AB ┃ R ┃ … ┃
┡━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━┩
│ string │ int64 │ int64 │ string │ string │ int64 │ int64 │ int64 │ … │
├───────────┼────────┼───────┼────────┼────────┼───────┼───────┼───────┼───┤
│ abercda01 │ 1871 │ 1 │ TRO │ NA │ 1 │ 4 │ 0 │ … │
│ addybo01 │ 1871 │ 1 │ RC1 │ NA │ 25 │ 118 │ 30 │ … │
│ allisar01 │ 1871 │ 1 │ CL1 │ NA │ 29 │ 137 │ 28 │ … │
│ allisdo01 │ 1871 │ 1 │ WS3 │ NA │ 27 │ 133 │ 28 │ … │
│ ansonca01 │ 1871 │ 1 │ RC1 │ NA │ 25 │ 120 │ 29 │ … │
│ armstbo01 │ 1871 │ 1 │ FW1 │ NA │ 12 │ 49 │ 9 │ … │
│ barkeal01 │ 1871 │ 1 │ RC1 │ NA │ 1 │ 4 │ 0 │ … │
│ barnero01 │ 1871 │ 1 │ BS1 │ NA │ 31 │ 157 │ 66 │ … │
│ barrebi01 │ 1871 │ 1 │ FW1 │ NA │ 1 │ 5 │ 1 │ … │
│ barrofr01 │ 1871 │ 1 │ BS1 │ NA │ 18 │ 86 │ 13 │ … │
│ … │ … │ … │ … │ … │ … │ … │ … │ … │
└───────────┴────────┴───────┴────────┴────────┴───────┴───────┴───────┴───┘
>>> t = ibis.read_csv("path/to/data.csv") # doctest: +SKIP
"""
from ibis.config import _default_backend

@@ -947,26 +928,7 @@ def read_parquet(sources: str | Path | Sequence[str | Path], **kwargs: Any) -> i
Examples
--------
>>> import ibis
>>> ibis.options.interactive = True
>>> t = ibis.examples.Batting_raw.fetch()
>>> t
┏━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━┓
┃ playerID ┃ yearID ┃ stint ┃ teamID ┃ lgID ┃ G ┃ AB ┃ R ┃ … ┃
┡━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━┩
│ string │ int64 │ int64 │ string │ string │ int64 │ int64 │ int64 │ … │
├───────────┼────────┼───────┼────────┼────────┼───────┼───────┼───────┼───┤
│ abercda01 │ 1871 │ 1 │ TRO │ NA │ 1 │ 4 │ 0 │ … │
│ addybo01 │ 1871 │ 1 │ RC1 │ NA │ 25 │ 118 │ 30 │ … │
│ allisar01 │ 1871 │ 1 │ CL1 │ NA │ 29 │ 137 │ 28 │ … │
│ allisdo01 │ 1871 │ 1 │ WS3 │ NA │ 27 │ 133 │ 28 │ … │
│ ansonca01 │ 1871 │ 1 │ RC1 │ NA │ 25 │ 120 │ 29 │ … │
│ armstbo01 │ 1871 │ 1 │ FW1 │ NA │ 12 │ 49 │ 9 │ … │
│ barkeal01 │ 1871 │ 1 │ RC1 │ NA │ 1 │ 4 │ 0 │ … │
│ barnero01 │ 1871 │ 1 │ BS1 │ NA │ 31 │ 157 │ 66 │ … │
│ barrebi01 │ 1871 │ 1 │ FW1 │ NA │ 1 │ 5 │ 1 │ … │
│ barrofr01 │ 1871 │ 1 │ BS1 │ NA │ 18 │ 86 │ 13 │ … │
│ … │ … │ … │ … │ … │ … │ … │ … │ … │
└───────────┴────────┴───────┴────────┴────────┴───────┴───────┴───────┴───┘
>>> t = ibis.read_parquet("path/to/data.parquet") # doctest: +SKIP
"""
from ibis.config import _default_backend

52 changes: 20 additions & 32 deletions ibis/expr/types/relations.py
@@ -598,22 +598,16 @@ def columns(self) -> list[str]:
--------
>>> import ibis
>>> ibis.options.interactive = True
>>> t = ibis.examples.starwars.fetch()
>>> t = ibis.examples.penguins.fetch()
>>> t.columns
['name',
'height',
'mass',
'hair_color',
'skin_color',
'eye_color',
'birth_year',
['species',
'island',
'bill_length_mm',
'bill_depth_mm',
'flipper_length_mm',
'body_mass_g',
'sex',
'gender',
'homeworld',
'species',
'films',
'vehicles',
'starships']
'year']
"""
return list(self.schema().names)

@@ -629,23 +623,17 @@ def schema(self) -> sch.Schema:
--------
>>> import ibis
>>> ibis.options.interactive = True
>>> t = ibis.examples.starwars.fetch()
>>> t = ibis.examples.penguins.fetch()
>>> t.schema()
ibis.Schema {
name string
height int64
mass float64
hair_color string
skin_color string
eye_color string
birth_year float64
sex string
gender string
homeworld string
species string
films string
vehicles string
starships string
species string
island string
bill_length_mm float64
bill_depth_mm float64
flipper_length_mm int64
body_mass_g int64
sex string
year int64
}
"""
return self.op().schema
@@ -2298,7 +2286,7 @@ def info(self) -> Table:
--------
>>> import ibis
>>> ibis.options.interactive = True
>>> t = ibis.examples.penguins.fetch(table_name="penguins")
>>> t = ibis.examples.penguins.fetch()
>>> t.info()
┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━┓
┃ name ┃ type ┃ nullable ┃ nulls ┃ non_nulls ┃ null_frac ┃ … ┃
@@ -2752,7 +2740,7 @@ def cache(self) -> Table:
--------
>>> import ibis
>>> ibis.options.interactive = True
>>> t = ibis.examples.penguins.fetch(table_name="penguins")
>>> t = ibis.examples.penguins.fetch()
>>> cached_penguins = t.mutate(computation="Heavy Computation").cache()
>>> cached_penguins
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━┓
@@ -2889,7 +2877,7 @@ def pivot_longer(
Similarly for a different example dataset, we convert names to values
but using a different selector and the default `values_to` value.
>>> world_bank_pop = ibis.examples.world_bank_pop_raw.fetch(header=1)
>>> world_bank_pop = ibis.examples.world_bank_pop_raw.fetch()
>>> world_bank_pop.head()
┏━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━┓
┃ country ┃ indicator ┃ 2000 ┃ 2001 ┃ 2002 ┃ … ┃
