diff --git a/ibis/backends/duckdb/__init__.py b/ibis/backends/duckdb/__init__.py
index 319ee4c56377..c1570f207539 100644
--- a/ibis/backends/duckdb/__init__.py
+++ b/ibis/backends/duckdb/__init__.py
@@ -36,6 +36,7 @@
 
     import pandas as pd
    import torch
+    from fsspec import AbstractFileSystem
 
 
 def normalize_filenames(source_list):
@@ -877,6 +878,44 @@ def attach_sqlite(
         con.execute(sa.text(f"SET GLOBAL sqlite_all_varchar={all_varchar}"))
         con.execute(sa.text(f"CALL sqlite_attach('{path}', overwrite={overwrite})"))
 
+    def register_filesystem(self, filesystem: AbstractFileSystem):
+        """Register an `fsspec` filesystem object with DuckDB.
+
+        This allows users to read from any `fsspec`-compatible filesystem using
+        `read_csv`, `read_parquet`, `read_json`, etc.
+
+
+        ::: {.callout-note}
+        Creating an `fsspec` filesystem requires that the corresponding
+        backend-specific `fsspec` helper library is installed.
+
+        For example, to connect to Google Cloud Storage, `gcsfs` must be installed.
+        :::
+
+        Parameters
+        ----------
+        filesystem
+            The fsspec filesystem object to register with DuckDB.
+            See https://duckdb.org/docs/guides/python/filesystems for details.
+
+        Examples
+        --------
+        >>> import ibis
+        >>> import fsspec
+        >>> gcs = fsspec.filesystem("gcs")
+        >>> con = ibis.duckdb.connect()
+        >>> con.register_filesystem(gcs)
+        >>> con.read_csv(
+        ...     "gcs://ibis-examples/data/band_members.csv.gz",
+        ...     table_name="band_members",
+        ... )
+        DatabaseTable: band_members
+          name string
+          band string
+        """
+        with self.begin() as con:
+            con.connection.register_filesystem(filesystem)
+
     def _run_pre_execute_hooks(self, expr: ir.Expr) -> None:
         # Warn for any tables depending on RecordBatchReaders that have already
         # started being consumed.
diff --git a/ibis/backends/duckdb/tests/test_register.py b/ibis/backends/duckdb/tests/test_register.py
index fb77c52810e5..816b08903242 100644
--- a/ibis/backends/duckdb/tests/test_register.py
+++ b/ibis/backends/duckdb/tests/test_register.py
@@ -350,3 +350,20 @@ def test_csv_with_slash_n_null(con, tmp_path):
     t = con.read_csv(data_path, nullstr="\\N")
     col = t.a.execute()
     assert pd.isna(col.iat[-1])
+
+
+@pytest.mark.xfail(
+    LINUX and SANDBOXED,
+    reason="nix can't hit GCS because it is sandboxed.",
+)
+def test_register_filesystem_gcs(con):
+    import fsspec
+
+    gcs = fsspec.filesystem("gcs")
+
+    con.register_filesystem(gcs)
+    band_members = con.read_csv(
+        "gcs://ibis-examples/data/band_members.csv.gz", table_name="band_members"
+    )
+
+    assert band_members.count().to_pyarrow()
diff --git a/ibis/util.py b/ibis/util.py
index 2524a7614cf1..0f7cb0e45a03 100644
--- a/ibis/util.py
+++ b/ibis/util.py
@@ -523,7 +523,9 @@ def normalize_filename(source: str | Path) -> str:
         source = source.removeprefix(f"{prefix}://")
 
     def _absolufy_paths(name):
-        if not name.startswith(("http", "s3", "az", "abfs", "abfss", "adl", "gs")):
+        if not name.startswith(
+            ("http", "s3", "az", "abfs", "abfss", "adl", "gs", "gcs", "azure")
+        ):
             return os.path.abspath(name)
         return name
 
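
For anyone trying the new method outside the sandboxed CI environment that the `xfail` above works around, here is a minimal end-to-end sketch. It assumes `gcsfs` is installed and that the public `ibis-examples` bucket allows anonymous reads; the `token="anon"` argument is a gcsfs option and is not part of this diff.

```python
import fsspec
import ibis

# Anonymous GCS client; requires the gcsfs package. `token="anon"` is a
# gcsfs-specific option used here only so no credentials are needed.
gcs = fsspec.filesystem("gcs", token="anon")

con = ibis.duckdb.connect()
con.register_filesystem(gcs)

# DuckDB now resolves gcs:// URLs through the registered fsspec filesystem.
band_members = con.read_csv(
    "gcs://ibis-examples/data/band_members.csv.gz", table_name="band_members"
)
print(band_members.count().to_pyarrow())
```

Note that the `gcs://` URL survives path normalization only because the allow-list in `normalize_filename` now includes `gcs`, which is exactly what the final hunk in `ibis/util.py` adds.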