
feat(duckdb): add read_json function for consuming newline-delimited JSON files
cpcloud authored and gforsyth committed Jan 24, 2023
1 parent 0d319ca commit 65e65c1
Showing 3 changed files with 94 additions and 3 deletions.
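For context, a brief usage sketch of the feature this commit adds (not part of the diff); it assumes DuckDB is the default backend and that data.json is a hypothetical newline-delimited JSON file:

import ibis

# Top-level helper added in ibis/expr/api.py; dispatches to the default (DuckDB) backend.
t = ibis.read_json("data.json")

# Equivalent call on an explicit connection, via the backend method added in this commit;
# table_name is optional and "my_json_data" is made up here.
con = ibis.duckdb.connect()
t = con.read_json("data.json", table_name="my_json_data")

print(t.count().execute())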
52 changes: 51 additions & 1 deletion ibis/backends/duckdb/__init__.py
@@ -26,11 +26,12 @@
from ibis.backends.duckdb.compiler import DuckDBSQLCompiler
from ibis.backends.duckdb.datatypes import parse

# counters for in-memory, parquet, and csv reads
# counters for in-memory, parquet, csv, and json reads
# used if no table name is specified
pd_n = itertools.count(0)
pa_n = itertools.count(0)
csv_n = itertools.count(0)
json_n = itertools.count(0)


def normalize_filenames(source_list):
@@ -209,6 +210,55 @@ def _register_failure(self):
            f"please call one of {msg} directly"
        )

    def read_json(
        self,
        source_list: str | list[str] | tuple[str],
        table_name: str | None = None,
        **kwargs,
    ) -> ir.Table:
        """Read newline-delimited JSON into an ibis table.

        Parameters
        ----------
        source_list
            File or list of files
        table_name
            Optional table name
        kwargs
            Additional keyword arguments passed to DuckDB's `read_json_objects` function

        Returns
        -------
        Table
            An ibis table expression
        """
        if not table_name:
            table_name = f"ibis_read_json_{next(json_n)}"

        source_list = normalize_filenames(source_list)

        objects = (
            sa.func.read_json_objects(
                sa.func.list_value(*source_list), _format_kwargs(kwargs)
            )
            .table_valued("raw")
            .render_derived()
        )
        # read a single row out to get the schema; assumes the first row is representative
        json_structure_query = sa.select(sa.func.json_structure(objects.c.raw)).limit(1)

        with self.begin() as con:
            json_structure = con.execute(json_structure_query).scalar()
            data = sa.select(sa.literal_column("s.*")).select_from(
                sa.select(
                    sa.func.json_transform(objects.c.raw, json_structure).label("s")
                ).subquery()
            )
            view = _create_view(sa.table(table_name), data, or_replace=True)
            con.execute(view)

        return self.table(table_name)

    def read_csv(
        self,
        source_list: str | list[str] | tuple[str],
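The read_json method above chains three DuckDB JSON functions: read_json_objects reads each raw JSON object, json_structure on a single row infers a schema (assuming the first row is representative), and json_transform casts every object to that schema before the result is exposed as a view. Below is a rough sketch of the same technique through the duckdb Python client — not the committed SQLAlchemy code; data.json, the view name, and the exact SQL spelling are assumptions, and the json extension must be available:

import duckdb

con = duckdb.connect()
# con.execute("INSTALL json; LOAD json;")  # only needed if the json extension isn't bundled

# 1. Read one raw JSON object and infer a structure (schema) for it.
structure = con.execute(
    "SELECT json_structure(raw) FROM read_json_objects('data.json') t(raw) LIMIT 1"
).fetchone()[0]

# 2. Transform every raw object with that structure and expose the result as a view,
#    mirroring what the backend method does with json_transform and _create_view.
con.execute(
    f"""
    CREATE OR REPLACE VIEW ibis_read_json_0 AS
    SELECT s.* FROM (
        SELECT json_transform(raw, '{structure}') AS s
        FROM read_json_objects('data.json') t(raw)
    ) sub
    """
)

print(con.execute("SELECT count(*) FROM ibis_read_json_0").fetchone())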
13 changes: 13 additions & 0 deletions ibis/backends/duckdb/tests/test_register.py
@@ -18,6 +18,19 @@ def test_read_parquet(data_directory):
    assert t.count().execute()


def test_read_json(data_directory, tmp_path):
    pqt = ibis.read_parquet(data_directory / "functional_alltypes.parquet")

    path = tmp_path.joinpath("ft.json")
    path.write_text(pqt.execute().to_json(orient="records", lines=True))

    jst = ibis.read_json(path)

    nrows = pqt.count().execute()
    assert nrows
    assert nrows == jst.count().execute()


def test_temp_directory(tmp_path):
    query = sa.text("SELECT value FROM duckdb_settings() WHERE name = 'temp_directory'")

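The test above round-trips a parquet table through newline-delimited JSON via pandas' to_json(orient="records", lines=True). For reference, the format is simply one JSON object per line with no surrounding array; here is a minimal sketch of writing such a file by hand (the path and records are made up):

import json

records = [{"id": 1, "string_col": "a"}, {"id": 2, "string_col": "b"}]  # made-up rows
with open("ft.json", "w") as f:  # illustrative path, echoing the test's ft.json
    for record in records:
        f.write(json.dumps(record) + "\n")  # one JSON object per line, no enclosing array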
32 changes: 30 additions & 2 deletions ibis/expr/api.py
@@ -47,6 +47,7 @@
    trailing_window,
    window,
)
from ibis.util import experimental

if TYPE_CHECKING:
import pandas as pd
@@ -142,6 +143,7 @@
    'random',
    'range_window',
    'read_csv',
    'read_json',
    'read_parquet',
    'row_number',
    'rows_with_max_lookback',
@@ -820,7 +822,7 @@ def row_number() -> ir.IntegerColumn:
    return ops.RowNumber().to_expr()


def read_csv(sources: str | Path, **kwargs: Any) -> ir.Table:
def read_csv(sources: str | Path | Sequence[str | Path], **kwargs: Any) -> ir.Table:
"""Lazily load a CSV or set of CSVs.
Parameters
@@ -847,7 +849,33 @@ def read_csv(sources: str | Path, **kwargs: Any) -> ir.Table:
    return con.read_csv(sources, **kwargs)


def read_parquet(sources: str | Path, **kwargs: Any) -> ir.Table:
@experimental
def read_json(sources: str | Path | Sequence[str | Path], **kwargs: Any) -> ir.Table:
    """Lazily load newline-delimited JSON data.

    Parameters
    ----------
    sources
        A filesystem path or URL or list of same.
    kwargs
        DuckDB-specific keyword arguments for the file type.

    Returns
    -------
    ir.Table
        Table expression representing a file

    Examples
    --------
    >>> t = ibis.read_json("data.json")
    """
    from ibis.config import _default_backend

    con = _default_backend()
    return con.read_json(sources, **kwargs)


def read_parquet(sources: str | Path | Sequence[str | Path], **kwargs: Any) -> ir.Table:
"""Lazily load a parquet file or set of parquet files.
Parameters
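Since the top-level read_json above just resolves the default backend and forwards **kwargs to its read_json method, backend-specific options such as the DuckDB method's table_name pass straight through. A short sketch (events.json is a hypothetical newline-delimited JSON file):

import ibis

# table_name reaches the DuckDB backend's read_json via **kwargs;
# the backing view is created under that name.
events = ibis.read_json("events.json", table_name="events")
print(events.count().execute())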
