Skip to content

Commit

Permalink
feat(duckdb): add register method to duckdb backend to load parquet and csv files
Browse files Browse the repository at this point in the history
  • Loading branch information
gforsyth authored and cpcloud committed Jun 21, 2022
1 parent 4501f3a commit 4ccc6fc
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 0 deletions.
45 changes: 45 additions & 0 deletions ibis/backends/duckdb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,51 @@ def do_connect(
)
self._meta = sa.MetaData(bind=self.con)

def register(
    self,
    file_name: str | Path,
    table_name: str | None = None,
) -> None:
    """Register an external file (csv or parquet) as a view in the current
    connection database.

    The file type is taken from an explicit ``csv://`` or ``parquet://``
    prefix when one is given. Without a prefix, or with a ``file://``
    prefix, it is inferred from the file extension.

    Parameters
    ----------
    file_name
        Name of the parquet or CSV file, optionally prefixed with
        ``csv://``, ``parquet://`` or ``file://``
    table_name
        Name for the created view. Defaults to the file name with its
        extension(s) removed and ``-`` replaced by ``_``

    Raises
    ------
    ValueError
        If `file_name` contains no path component
    TypeError
        If the file is neither a csv nor a parquet file
    """
    raw = str(file_name)
    scheme, separator, remainder = raw.partition("://")
    # Split on the literal "://" first so that an absolute path following
    # the prefix ("parquet:///data/x.parquet") keeps its root; going through
    # ``Path.parts`` would drop it. Single-letter schemes are left alone
    # because they look like Windows drive letters.
    if separator and len(scheme) > 1 and scheme.isalnum():
        prefix = f"{scheme}:"
        path = Path(remainder)
    else:
        path = Path(raw)
        parts = path.parts
        if parts and parts[0].endswith(":"):
            # A ``Path`` input has already collapsed "csv://x" to "csv:/x",
            # so the prefix shows up as the first path component.
            prefix, *rest = parts
            path = Path(*rest)
        else:
            prefix = "file:"

    if not path.parts:
        raise ValueError(f"No file path found in {raw!r}")

    path = path.absolute()
    extensions = "".join(path.suffixes)
    suffix = extensions.strip(".")  # handles .csv.gz

    # Use prefix for file_type. If omitted, infer from file extension
    file_type = prefix.strip(":") if prefix != "file:" else suffix
    if file_type == "parquet":
        reader = "read_parquet"
    elif file_type.startswith("csv"):
        reader = "read_csv_auto"
    else:
        raise TypeError(
            "Only csv and parquet files can be registered with DuckDB."
        )

    if not table_name:
        # Drop every extension, not just the last one: ``Path.stem`` would
        # turn "diamonds.csv.gz" into "diamonds.csv", which is not a usable
        # view name.
        base_name = path.name[: len(path.name) - len(extensions)]
        table_name = base_name.replace("-", "_")

    # Double any single quote so the path stays one SQL string literal.
    quoted_path = str(path).replace("'", "''")
    view = f"""
    CREATE VIEW {table_name} as SELECT * from
    {reader}('{quoted_path}')
    """

    self.con.execute(view)

def fetch_from_cursor(
self,
cursor: duckdb.DuckDBPyConnection,
Expand Down
34 changes: 34 additions & 0 deletions ibis/backends/duckdb/tests/test_register.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import contextlib
import os

import pytest

import ibis


@contextlib.contextmanager
def pushd(new_dir):
    """Temporarily change the working directory to `new_dir`.

    The previous working directory is restored when the ``with`` block
    exits, including when it exits by raising an exception.
    """
    previous_dir = os.getcwd()
    os.chdir(new_dir)
    try:
        yield
    finally:
        # Without the ``finally``, a failing body would leave the process in
        # ``new_dir`` and affect every test that runs afterwards.
        os.chdir(previous_dir)


@pytest.mark.parametrize(
    "fname, in_table_name, out_table_name",
    [
        ("diamonds.csv", None, "diamonds"),
        ("csv://diamonds.csv", "Diamonds", "Diamonds"),
        ("parquet://batting.parquet", None, "batting"),
        ("batting.parquet", "baseball", "baseball"),
    ],
)
def test_register_file(data_directory, fname, in_table_name, out_table_name):
    """Registering a csv/parquet file exposes it as a non-empty table.

    Covers both type detection paths (explicit ``csv://`` / ``parquet://``
    prefix and plain file extension) and both naming paths (explicit
    ``table_name`` and the default derived from the file name).
    """
    backend = ibis.duckdb.connect()

    # File names are relative, so register them from inside the data directory.
    with pushd(data_directory):
        backend.register(fname, table_name=in_table_name)

    known_tables = backend.list_tables()
    assert out_table_name in known_tables

    row_count = backend.table(out_table_name).count().execute()
    assert row_count > 0

0 comments on commit 4ccc6fc

Please sign in to comment.