feat: Update for vega-datasets@3.0.0-alpha.1
Made possible via vega/vega-datasets#681

- Removes temp files
- Removes some outdated APIs
- Removes a test based on the removed `"points"` dataset
dangotbanned committed Feb 7, 2025
1 parent 6c724e9 commit 51a967a
Showing 12 changed files with 37 additions and 87 deletions.
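
In short: the tooling now pins vega-datasets to 3.0.0-alpha.1 instead of tracking main, and the temporary `frozen` escape hatch (which re-read a cached datapackage.json rather than making any requests) is gone. Below is a minimal sketch of the resulting entry point, assembled from the hunks that follow; the tag literal stands in for `VERSIONS["vega-datasets"]`.

```python
# Sketch only -- assembled from this commit's hunks, not shipped code as-is.
from tools import datasets

# Application() no longer takes output-directory arguments; every output
# path is derived from its class-level OUT_DIR (altair/datasets).
app = datasets.Application()  # the module-level `datasets.app` singleton is equivalent

# refresh() lost its `frozen` keyword: it now always fetches datapackage.json
# for the given tag, rewrites the files under altair/datasets/_metadata, and
# (optionally) regenerates altair/datasets/_typing.py.
metadata = app.refresh("3.0.0-alpha.1", include_typing=True)  # -> pl.DataFrame
```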
Binary file modified altair/datasets/_metadata/metadata.csv.gz
Binary file modified altair/datasets/_metadata/metadata.parquet
Binary file modified altair/datasets/_metadata/schemas.json.gz
3 changes: 1 addition & 2 deletions altair/datasets/_typing.py
@@ -69,7 +69,6 @@
     "ohlc",
     "penguins",
     "platformer-terrain",
-    "points",
     "political-contributions",
     "population",
     "population_engineers_hurricanes",
@@ -151,7 +150,7 @@ class Metadata(TypedDict, total=False):
     ``Metadata`` keywords form constraints to filter a table like the below sample:
 
     ```
-    shape: (73, 13)
+    shape: (72, 13)
     ┌────────────────┬────────┬────────────────┬───┬───────────────┬───────────────┐
     │ dataset_name   ┆ suffix ┆ file_name      ┆ … ┆ sha           ┆ url           │
     │ ---            ┆ ---    ┆ ---            ┆   ┆ ---           ┆ ---           │
2 changes: 1 addition & 1 deletion altair/utils/schemapi.py
@@ -1684,7 +1684,7 @@ def with_property_setters(cls: type[TSchemaBase]) -> type[TSchemaBase]:
     ],
     str,
 ] = {
-    "vega-datasets": "main",
+    "vega-datasets": "3.0.0-alpha.1",
     "vega-embed": "6",
     "vega-lite": "v5.21.0",
     "vegafusion": "1.6.6",
6 changes: 3 additions & 3 deletions pyproject.toml
@@ -104,9 +104,9 @@ doc = [
 
 [tool.altair.vega]
 # Minimum/exact versions, for projects under the `vega` organization
-vega-datasets = "main"          # https://github.com/vega/vega-datasets
-vega-embed = "6"                # https://github.com/vega/vega-embed
-vega-lite = "v5.21.0"           # https://github.com/vega/vega-lite
+vega-datasets = "3.0.0-alpha.1" # https://github.com/vega/vega-datasets
+vega-embed = "6"                # https://github.com/vega/vega-embed
+vega-lite = "v5.21.0"           # https://github.com/vega/vega-lite
 
 [tool.hatch]
 build = { include = ["/altair"], artifacts = ["altair/jupyter/js/index.js"] }
1 change: 0 additions & 1 deletion tests/test_datasets.py
@@ -227,7 +227,6 @@ def test_load_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None:
         "ohlc",
         "penguins",
         "platformer-terrain",
-        "points",
         "political-contributions",
         "population",
         "population_engineers_hurricanes",
61 changes: 15 additions & 46 deletions tools/datasets/__init__.py
@@ -20,7 +20,7 @@
 import types
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Literal
+from typing import TYPE_CHECKING, Any, ClassVar, Literal
 
 from tools import fs
 from tools.codemod import ruff
@@ -40,9 +40,7 @@
 else:
     from typing_extensions import TypeAlias
 
-_PathAlias: TypeAlias = Literal[
-    "typing", "metadata-csv", "metadata", "schemas", "datapackage"
-]
+_PathAlias: TypeAlias = Literal["typing", "metadata-csv", "metadata", "schemas"]
 PathMap: TypeAlias = Mapping[_PathAlias, Path]
 
 __all__ = ["app"]
@@ -54,33 +52,19 @@
 
 
 class Application:
-    """
-    Top-level context.
-
-    Parameters
-    ----------
-    out_dir_tools, out_dir_altair
-        Directories to store metadata files.
-    out_fp_typing
-        Path to write metadata-derived typing module.
-
-    See Also
-    --------
-    - tools.datasets.npm.Npm
-    """
-
-    def __init__(
-        self, out_dir_tools: Path, out_dir_altair: Path, out_fp_typing: Path
-    ) -> None:
-        fs.mkdir(out_dir_tools)
+    """Top-level context."""
+
+    OUT_DIR: ClassVar[Path] = fs.REPO_ROOT / "altair" / "datasets"
+
+    def __init__(self) -> None:
         METADATA = "metadata"
+        out_meta = self.OUT_DIR / "_metadata"
         self.paths = types.MappingProxyType["_PathAlias", Path](
             {
-                "typing": out_fp_typing,
-                "metadata-csv": out_dir_altair / f"{METADATA}.csv.gz",
-                "metadata": out_dir_altair / f"{METADATA}.parquet",
-                "schemas": out_dir_altair / "schemas.json.gz",
-                "datapackage": out_dir_tools / "datapackage.json",
+                "typing": self.OUT_DIR / "_typing.py",
+                "metadata-csv": out_meta / f"{METADATA}.csv.gz",
+                "metadata": out_meta / f"{METADATA}.parquet",
+                "schemas": out_meta / "schemas.json.gz",
             }
         )
         self._npm: Npm = Npm(self.paths)
@@ -89,9 +73,7 @@ def __init__(
     def npm(self) -> Npm:
         return self._npm
 
-    def refresh(
-        self, tag: Any, /, *, include_typing: bool = False, frozen: bool = False
-    ) -> pl.DataFrame:
+    def refresh(self, tag: Any, /, *, include_typing: bool = False) -> pl.DataFrame:
         """
         Update and sync all dataset metadata files.
@@ -101,17 +83,9 @@
             Branch or release version to build against.
         include_typing
             Regenerate ``altair.datasets._typing``.
-        frozen
-            Don't perform any requests.
-
-            .. note::
-                **Temporary** measure to work from ``main`` until `vega-datasets@3`_.
-
-            .. _vega-datasets@3:
-                https://github.com/vega/vega-datasets/issues/654
         """
         print("Syncing datasets ...")
-        dpkg = self.npm.datapackage(tag=tag, frozen=frozen)
+        dpkg = self.npm.datapackage(tag=tag)
         self.write_parquet(dpkg.core, self.paths["metadata"])
         self.write_json_gzip(dpkg.schemas(), self.paths["schemas"])
         self.write_csv_gzip(dpkg.metadata_csv(), self.paths["metadata-csv"])
@@ -226,9 +200,4 @@ def generate_typing(self, dpkg: datapackage.DataPackage) -> None:
         ruff.write_lint_format(self.paths["typing"], contents)
 
 
-_alt_datasets = fs.REPO_ROOT / "altair" / "datasets"
-app = Application(
-    Path(__file__).parent / "_metadata",
-    _alt_datasets / "_metadata",
-    _alt_datasets / "_typing.py",
-)
+app = Application()
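
For orientation, here is a rough equivalent of the path map the rewritten constructor builds — the layout is read directly off the hunk above, and `REPO_ROOT` resolves to wherever the repository is checked out:

```python
from tools import fs

OUT_DIR = fs.REPO_ROOT / "altair" / "datasets"  # mirrors Application.OUT_DIR
out_meta = OUT_DIR / "_metadata"

# Equivalent of Application().paths (exposed as a read-only MappingProxyType).
# Note the former "datapackage" key is gone -- it pointed at the
# tools/datasets/_metadata/datapackage.json file deleted just below.
paths = {
    "typing": OUT_DIR / "_typing.py",
    "metadata-csv": out_meta / "metadata.csv.gz",
    "metadata": out_meta / "metadata.parquet",
    "schemas": out_meta / "schemas.json.gz",
}
```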
1 change: 0 additions & 1 deletion tools/datasets/_metadata/datapackage.json

This file was deleted.

8 changes: 0 additions & 8 deletions tools/datasets/models.py
@@ -20,7 +20,6 @@
     from typing import TypeAlias
 else:
     from typing_extensions import TypeAlias
-import polars as pl
 
 from altair.datasets._typing import Dataset, FlFieldStr

@@ -117,10 +116,3 @@ class Package(TypedDict):
     sources: Sequence[Source]
     created: str
     resources: Sequence[Resource]
-
-
-class ParsedPackage(TypedDict):
-    """Minimal representations to write to disk."""
-
-    features: pl.DataFrame
-    schemas: Mapping[Dataset, Mapping[str, FlFieldStr]]
40 changes: 16 additions & 24 deletions tools/datasets/npm.py
@@ -5,6 +5,7 @@
 import urllib.request
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar, Literal, NamedTuple
+from urllib.request import Request
 
 from tools.datasets import datapackage

@@ -22,7 +23,6 @@
         from typing_extensions import TypeAlias
 
     from tools.datasets import PathMap
    from tools.datasets.datapackage import DataPackage
-    from tools.datasets.models import Package
 
 BranchOrTag: TypeAlias = 'Literal["main"] | LiteralString'

@@ -54,30 +54,25 @@ def __init__(
             GH=f"https://cdn.{jsdelivr}.net/gh/vega/{package}@",
         )
 
-    def dataset_base_url(self, version: BranchOrTag, /) -> LiteralString:
-        """
-        Common url prefix for all datasets derived from ``version``.
-
-        Notes
-        -----
-        - Encodes the endpoint at this stage
-        - Use github if its the only option (since its slower otherwise)
-        - npm only has releases/tags (not branches)
-        """
-        return f"{self.url.GH if is_branch(version) else self.url.CDN}{version}/data/"
+    def _prefix(self, version: BranchOrTag, /) -> LiteralString:
+        return f"{self.url.GH if is_branch(version) else self.url.CDN}{version}/"
+
+    def dataset_base_url(self, version: BranchOrTag, /) -> LiteralString:
+        """Common url prefix for all datasets derived from ``version``."""
+        return f"{self._prefix(version)}data/"
 
     @property
     def url(self) -> NpmUrl:
         return self._url
 
-    def file_gh(
+    def file(
         self,
         branch_or_tag: BranchOrTag,
         path: str,
         /,
     ) -> Any:
         """
-        Request a file from the `jsdelivr GitHub`_ endpoint.
+        Request a file from `jsdelivr` `npm`_ or `GitHub`_ endpoints.
 
         Parameters
         ----------
@@ -86,7 +81,9 @@ def file_gh(
         path
             Relative filepath from the root of the repo.
 
-        .. _jsdelivr GitHub:
+        .. _npm:
+            https://www.jsdelivr.com/documentation#id-npm
+        .. _GitHub:
             https://www.jsdelivr.com/documentation#id-github
         .. _branches:
             https://github.com/vega/vega-datasets/branches
@@ -100,20 +97,15 @@
             read_fn = json.load
         else:
             raise NotImplementedError(path, suffix)
-        req = urllib.request.Request(
-            f"{self.url.GH}{branch_or_tag}/{path}", headers=headers
-        )
+        req = Request(f"{self._prefix(branch_or_tag)}{path}", headers=headers)
         with self._opener.open(req) as response:
             return read_fn(response)
 
-    def datapackage(self, *, tag: LiteralString, frozen: bool = False) -> DataPackage:
-        pkg: Package = (
-            json.loads(self.paths["datapackage"].read_text("utf-8"))
-            if frozen
-            else self.file_gh(tag, "datapackage.json")
-        )
+    def datapackage(self, *, tag: LiteralString) -> DataPackage:
         return datapackage.DataPackage(
-            pkg, self.dataset_base_url(tag), self.paths["metadata"]
+            self.file(tag, "datapackage.json"),
+            self.dataset_base_url(tag),
+            self.paths["metadata"],
         )


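To make the endpoint routing concrete: `_prefix` picks jsdelivr's GitHub endpoint for branches and its npm endpoint for tags, and both `file()` and `dataset_base_url()` build on it, so `datapackage.json` and the `data/` files always come from the same endpoint for a given version. The GitHub URL shape is visible in the `GH=` template above; the `CDN` template is outside the shown hunks, so the npm URL below is an assumption based on jsdelivr's documented `/npm/` pattern:

```python
npm = Npm(app.paths)

npm._prefix("main")
# branch -> GitHub endpoint (template shown in the hunk above):
#   "https://cdn.jsdelivr.net/gh/vega/vega-datasets@main/"

npm._prefix("3.0.0-alpha.1")
# tag -> npm endpoint (assumed shape; the CDN template is not in the diff):
#   "https://cdn.jsdelivr.net/npm/vega-datasets@3.0.0-alpha.1/"

# file() then requests f"{prefix}{path}" (e.g. "datapackage.json"),
# while dataset_base_url() returns f"{prefix}data/".
```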
2 changes: 1 addition & 1 deletion tools/generate_schema_wrapper.py
@@ -1405,7 +1405,7 @@ def main() -> None:
     copy_schemapi_util()
     vegalite_main(args.skip_download)
     write_expr_module(VERSIONS.vlc_vega, output=EXPR_FILE, header=HEADER_COMMENT)
-    datasets.app.refresh(VERSIONS["vega-datasets"], include_typing=True, frozen=True)
+    datasets.app.refresh(VERSIONS["vega-datasets"], include_typing=True)
 
     # The modules below are imported after the generation of the new schema files
     # as these modules import Altair. This allows them to use the new changes
