Skip to content

Commit

Permalink
feat(duckdb): allow users to register fsspec filesystem with DuckDB
Browse files Browse the repository at this point in the history
  • Loading branch information
gforsyth authored and cpcloud committed Nov 10, 2023
1 parent ee94cb5 commit 6172f07
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 1 deletion.
39 changes: 39 additions & 0 deletions ibis/backends/duckdb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@

import pandas as pd
import torch
from fsspec import AbstractFileSystem


def normalize_filenames(source_list):
Expand Down Expand Up @@ -877,6 +878,44 @@ def attach_sqlite(
con.execute(sa.text(f"SET GLOBAL sqlite_all_varchar={all_varchar}"))
con.execute(sa.text(f"CALL sqlite_attach('{path}', overwrite={overwrite})"))

def register_filesystem(self, filesystem: AbstractFileSystem) -> None:
    """Register an `fsspec` filesystem object with DuckDB.

    This allows a user to read from any `fsspec` compatible filesystem using
    `read_csv`, `read_parquet`, `read_json`, etc.

    ::: {.callout-note}
    Creating an `fsspec` filesystem requires that the corresponding
    backend-specific `fsspec` helper library is installed.

    e.g. to connect to Google Cloud Storage, `gcsfs` must be installed.
    :::

    Parameters
    ----------
    filesystem
        The fsspec filesystem object to register with DuckDB.
        See https://duckdb.org/docs/guides/python/filesystems for details.

    Examples
    --------
    >>> import ibis
    >>> import fsspec
    >>> gcs = fsspec.filesystem("gcs")
    >>> con = ibis.duckdb.connect()
    >>> con.register_filesystem(gcs)
    >>> t = con.read_csv(
    ...     "gcs://ibis-examples/data/band_members.csv.gz",
    ...     table_name="band_members",
    ... )
    >>> t
    DatabaseTable: band_members
      name string
      band string
    """
    # Registration is done on the `.connection` attribute of the object
    # yielded by `self.begin()`, not on that object itself.
    with self.begin() as con:
        con.connection.register_filesystem(filesystem)

def _run_pre_execute_hooks(self, expr: ir.Expr) -> None:
# Warn for any tables depending on RecordBatchReaders that have already
# started being consumed.
Expand Down
17 changes: 17 additions & 0 deletions ibis/backends/duckdb/tests/test_register.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,3 +350,20 @@ def test_csv_with_slash_n_null(con, tmp_path):
t = con.read_csv(data_path, nullstr="\\N")
col = t.a.execute()
assert pd.isna(col.iat[-1])


@pytest.mark.xfail(
    LINUX and SANDBOXED,
    reason="nix can't hit GCS because it is sandboxed.",
)
def test_register_filesystem_gcs(con):
    """Read a gzipped CSV from GCS after registering an fsspec filesystem."""
    import fsspec

    # Register the GCS filesystem so the `gcs://` URL below can be resolved.
    con.register_filesystem(fsspec.filesystem("gcs"))

    table = con.read_csv(
        "gcs://ibis-examples/data/band_members.csv.gz",
        table_name="band_members",
    )

    row_count = table.count().to_pyarrow()
    assert row_count
4 changes: 3 additions & 1 deletion ibis/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -523,7 +523,9 @@ def normalize_filename(source: str | Path) -> str:
source = source.removeprefix(f"{prefix}://")

def _absolufy_paths(name):
    """Return `name` unchanged if it is a remote URL, else its absolute path.

    A name counts as remote only when it has a `<scheme>://` prefix whose
    scheme starts with one of the known remote prefixes. Prefix matching on
    the scheme keeps variants such as `https://` and `s3a://` working, while
    requiring the `://` separator stops local files that merely begin with
    those letters (e.g. `gs_results.csv`, `http_logs/a.csv`) from being
    mistaken for URLs and left relative.

    Parameters
    ----------
    name
        A local path or a remote URL, as a string.

    Returns
    -------
    str
        `name` itself for remote URLs, otherwise `os.path.abspath(name)`.
    """
    remote_prefixes = (
        "http",
        "s3",
        "az",
        "abfs",
        "abfss",
        "adl",
        "gs",
        "gcs",
        "azure",
    )
    scheme, separator, _ = name.partition("://")
    # A "/" before the separator means the "://" sits inside a path component
    # rather than directly after a URL scheme.
    is_remote = (
        bool(separator)
        and "/" not in scheme
        and scheme.startswith(remote_prefixes)
    )
    if is_remote:
        return name
    return os.path.abspath(name)

Expand Down

0 comments on commit 6172f07

Please sign in to comment.