From 11af489aac9e9fbbdb68fb93c70adedf754a5c00 Mon Sep 17 00:00:00 2001
From: Phillip Cloud <417981+cpcloud@users.noreply.github.com>
Date: Thu, 25 Jul 2024 07:28:41 -0700
Subject: [PATCH] feat(duckdb): support arbitrary url prefixes (#9691)

---
 ibis/backends/duckdb/tests/test_client.py | 23 +++++++++++++++++++++++
 ibis/util.py                              |  5 ++---
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/ibis/backends/duckdb/tests/test_client.py b/ibis/backends/duckdb/tests/test_client.py
index b53044f81f6b..ac55ba9404ea 100644
--- a/ibis/backends/duckdb/tests/test_client.py
+++ b/ibis/backends/duckdb/tests/test_client.py
@@ -322,3 +322,26 @@ def test_connect_named_in_memory_db():
 
     default_memory_db = ibis.duckdb.connect()
     assert "ork" not in default_memory_db.list_tables()
+
+
+@pytest.mark.parametrize(
+    ("url", "method_name"),
+    [
+        ("hf://datasets/datasets-examples/doc-formats-csv-1/data.csv", "read_csv"),
+        ("hf://datasets/datasets-examples/doc-formats-jsonl-1/data.jsonl", "read_json"),
+        (
+            "hf://datasets/datasets-examples/doc-formats-parquet-1/data/train-00000-of-00001.parquet",
+            "read_parquet",
+        ),
+    ],
+    ids=["csv", "jsonl", "parquet"],
+)
+@pytest.mark.xfail(
+    LINUX and SANDBOXED,
+    reason="nix on linux is not allowed to access the network and cannot download the httpfs extension",
+    raises=duckdb.Error,
+)
+def test_hugging_face(con, url, method_name):
+    method = getattr(con, method_name)
+    t = method(url)
+    assert t.count().execute() > 0
diff --git a/ibis/util.py b/ibis/util.py
index 30844b8857e0..d127f141061b 100644
--- a/ibis/util.py
+++ b/ibis/util.py
@@ -10,6 +10,7 @@
 import itertools
 import operator
 import os
+import re
 import sys
 import textwrap
 import types
@@ -499,9 +500,7 @@ def normalize_filename(source: str | Path) -> str:
         source = source.removeprefix(f"{prefix}://")
 
     def _absolufy_paths(name):
-        if not name.startswith(
-            ("http", "s3", "az", "abfs", "abfss", "adl", "gs", "gcs", "azure")
-        ):
+        if re.search(r"^(?:.+)://", name) is None:
             return os.path.abspath(name)
         return name