Skip to content

Commit

Permalink
feat: add duckdb backend
Browse files Browse the repository at this point in the history
Co-authored-by: Gil Forsyth <gil@forsyth.dev>

fix(duckdb): apply schema to output cursor

fix(duckdb): add overrides for log, round, mod

fix(duckdb): add xfails or implementations for test_agg through test_param

feat(duckdb): add timestamp_diff, regex_extract, translate

test(duckdb): add duckdb marker, mark notimpl tests

chore(duckdb): add to setup.py

chore(duckdb): don't gate import in datamgr.py

chore(duckdb): default path to :memory:

chore: micronit

chore(duckdb): remove broken registry code

chore(duckdb): add upper bound to dependency

test(duckdb): change notimpl to notyet for non-finite tests

chore(duckdb): add `duckdb` to github actions backends

chore: update setup.py

chore(duckdb): add load duckdb to datamgr.py

test(duckdb): add duckdb tests to xdist_group to avoid file contention

Similar to sqlite, the duckdb tests need to be run on the same process

refactor: use sqlalchemy dialect for duckdb

build: add duckdb-engine sqlalchemy dialect
test: test_client test now pass due to sqlalchemy dialect usage

chore: regen files

refactor: delete more base class code

test: add duckdb to sqlalchemy fixture

test(duckdb): notimpl coalesce tests

test: run ci-check in serial until we sort out RW on windows

chore: don't overwrite postgres operations

chore(poetry.lock): regen
  • Loading branch information
cpcloud committed Mar 1, 2022
1 parent 9153255 commit 667f2d5
Show file tree
Hide file tree
Showing 29 changed files with 619 additions and 84 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/ibis-backends.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ jobs:
backend:
- name: dask
title: Dask
- name: duckdb
title: DuckDB
- name: pandas
title: Pandas
- name: sqlite
Expand Down Expand Up @@ -99,7 +101,7 @@ jobs:

- name: run non-pyspark tests
if: ${{ matrix.backend.name != 'pyspark' }}
run: just ci-check -m ${{ matrix.backend.name }} --numprocesses auto --dist loadgroup
run: just ci-check -m ${{ matrix.backend.name }}

- name: run pyspark tests
if: ${{ matrix.backend.name == 'pyspark' }}
Expand Down
37 changes: 37 additions & 0 deletions ci/datamgr.py
Original file line number Diff line number Diff line change
Expand Up @@ -708,6 +708,43 @@ def all(ctx):
future.result()


@load.command()
@click.option('-D', '--database', default='ibis_testing')
@click.option(
'-S',
'--schema',
type=click.File('rt'),
default=SCRIPT_DIR / 'schema' / 'duckdb.sql',
help='Path to SQL file that initializes the database via DDL.',
)
@click.option('-t', '--tables', multiple=True, default=TEST_TABLES)
@click.option(
'-d',
'--data-directory',
default=DATA_DIR,
type=click.Path(
exists=True,
file_okay=False,
dir_okay=True,
writable=True,
readable=True,
path_type=Path,
),
)
def duckdb(schema, tables, data_directory, database, **params):
import duckdb # noqa: F401

logger.info('Initializing DuckDB...')
conn = duckdb.connect(f"ci/ibis-testing-data/{database}.ddb")
for stmt in filter(None, map(str.strip, schema.read().split(';'))):
conn.execute(stmt)

for table in tables:
src = data_directory / f'{table}.csv'
sql = f"INSERT INTO {table} SELECT * FROM '{src}';"
conn.execute(sql)


if __name__ == '__main__':
"""
Environment Variables are automatically parsed:
Expand Down
72 changes: 72 additions & 0 deletions ci/schema/duckdb.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
DROP TABLE IF EXISTS diamonds CASCADE;

CREATE TABLE diamonds (
carat FLOAT,
cut TEXT,
color TEXT,
clarity TEXT,
depth FLOAT,
"table" FLOAT,
price BIGINT,
x FLOAT,
y FLOAT,
z FLOAT
);

DROP TABLE IF EXISTS batting CASCADE;

CREATE TABLE batting (
"playerID" TEXT,
"yearID" BIGINT,
stint BIGINT,
"teamID" TEXT,
"lgID" TEXT,
"G" BIGINT,
"AB" BIGINT,
"R" BIGINT,
"H" BIGINT,
"X2B" BIGINT,
"X3B" BIGINT,
"HR" BIGINT,
"RBI" BIGINT,
"SB" BIGINT,
"CS" BIGINT,
"BB" BIGINT,
"SO" BIGINT,
"IBB" BIGINT,
"HBP" BIGINT,
"SH" BIGINT,
"SF" BIGINT,
"GIDP" BIGINT
);

DROP TABLE IF EXISTS awards_players CASCADE;

CREATE TABLE awards_players (
"playerID" TEXT,
"awardID" TEXT,
"yearID" BIGINT,
"lgID" TEXT,
tie TEXT,
notes TEXT
);

DROP TABLE IF EXISTS functional_alltypes CASCADE;

CREATE TABLE functional_alltypes (
"index" BIGINT,
"Unnamed: 0" BIGINT,
id INTEGER,
bool_col BOOLEAN,
tinyint_col SMALLINT,
smallint_col SMALLINT,
int_col INTEGER,
bigint_col BIGINT,
float_col REAL,
double_col DOUBLE PRECISION,
date_string_col TEXT,
string_col TEXT,
timestamp_col TIMESTAMP WITHOUT TIME ZONE,
year INTEGER,
month INTEGER
);
6 changes: 5 additions & 1 deletion ibis/backends/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ def pytest_collection_modifyitems(session, config, items):

if "sqlite" in item.nodeid:
item.add_marker(pytest.mark.xdist_group(name="sqlite"))
if "duckdb" in item.nodeid:
item.add_marker(pytest.mark.xdist_group(name="duckdb"))


@lru_cache(maxsize=None)
Expand Down Expand Up @@ -307,7 +309,9 @@ def ddl_con(ddl_backend):


@pytest.fixture(
params=_get_backends_to_test(keep=("sqlite", "postgres", "mysql")),
params=_get_backends_to_test(
keep=("sqlite", "postgres", "mysql", "duckdb")
),
scope='session',
)
def alchemy_backend(request, data_directory):
Expand Down
82 changes: 82 additions & 0 deletions ibis/backends/duckdb/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
"""DuckDB backend."""

from __future__ import annotations

import os

import duckdb
import sqlalchemy as sa

import ibis.expr.schema as sch
import ibis.expr.types as ir
from ibis.backends.base.sql.alchemy import BaseAlchemyBackend

from .compiler import DuckDBSQLCompiler


# TODO: AlchemyTable calls infer on the sqla_table attribute in the super call
# in DuckDBTable -- it wants a sa.Table this is a hack to get it to make it
# through the parent __init__ without barfing
@sch.infer.register(sa.sql.selectable.TableClause)
def schema_passthrough(table, schema=None):
return schema


class Backend(BaseAlchemyBackend):
name = "duckdb"
compiler = DuckDBSQLCompiler

def current_database(self):
raise NotImplementedError

@property
def version(self) -> str:
# TODO: there is a `PRAGMA version` we could use instead
try:
import importlib.metadata as importlib_metadata
except ImportError:
# TODO: remove this when Python 3.9 support is dropped
import importlib_metadata
return importlib_metadata.version("duckdb")

def do_connect(
self,
path: str = ":memory:",
read_only: bool = False,
) -> None:
"""Create an Ibis client connected to a DuckDB database."""
if path != ":memory:":
path = os.path.abspath(path)
super().do_connect(
sa.create_engine(
f"duckdb:///{path}",
connect_args={"read_only": read_only},
)
)
self._meta = sa.MetaData(bind=self.con)

def table(self, name: str, database: str | None = None) -> ir.TableExpr:
"""Create a table expression from a table in the SQLite database.
Parameters
----------
name
Table name
database
Name of the attached database that the table is located in.
Returns
-------
TableExpr
Table expression
"""
return self._sqla_table_to_expr(
self._get_sqla_table(name, schema=database)
)

def fetch_from_cursor(
self, cursor: duckdb.DuckDBPyConnection, schema: sch.Schema
):
df = cursor.cursor.fetch_df()
df = schema.apply_to(df)
return df
31 changes: 31 additions & 0 deletions ibis/backends/duckdb/compiler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import ibis.expr.operations as ops
from ibis.backends.base.sql.alchemy import (
AlchemyCompiler,
AlchemyExprTranslator,
)

from .registry import operation_registry


class DuckDBSQLExprTranslator(AlchemyExprTranslator):
_registry = operation_registry
_rewrites = AlchemyExprTranslator._rewrites.copy()
# The PostgreSQLExprTranslater maps to a `DOUBLE_PRECISION`
# type that duckdb doesn't understand, but we probably still want
# the updated `operation_registry` from postgres
_type_map = AlchemyExprTranslator._type_map.copy()


rewrites = DuckDBSQLExprTranslator.rewrites


@rewrites(ops.Any)
@rewrites(ops.All)
@rewrites(ops.NotAny)
@rewrites(ops.NotAll)
def _any_all_no_op(expr):
return expr


class DuckDBSQLCompiler(AlchemyCompiler):
translator_class = DuckDBSQLExprTranslator
78 changes: 78 additions & 0 deletions ibis/backends/duckdb/registry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import sqlalchemy as sa

import ibis.expr.operations as ops
from ibis.backends.base.sql.alchemy import unary

from ..postgres.registry import fixed_arity, operation_registry

operation_registry = operation_registry.copy()


def _round(t, expr):
arg, digits = expr.op().args
sa_arg = t.translate(arg)

if digits is None:
return sa.func.round(sa_arg)

result = sa.func.round(sa_arg, t.translate(digits))
return result


def _mod(t, expr):
left, right = map(t.translate, expr.op().args)
return left % right


def _log(t, expr):
arg, base = expr.op().args
sa_arg = t.translate(arg)
if base is not None:
sa_base = t.translate(base)
if sa_base.value == 2:
return sa.func.log2(sa_arg)
elif sa_base.value == 10:
return sa.func.log(sa_arg)
else:
raise NotImplementedError
return sa.func.log(sa_base, sa_arg)
return sa.func.ln(sa_arg)


def _timestamp_from_unix(t, expr):
op = expr.op()
arg, unit = op.args
arg = t.translate(arg)

if unit in {"us", "ns"}:
raise ValueError(f"`{unit}` unit is not supported!")

if unit == "ms":
return sa.func.epoch_ms(arg)
elif unit == "s":
return sa.func.to_timestamp(arg)


# TODO(gil): this is working except the results of the
# substraction are being truncated
def _timestamp_diff(t, expr):
sa_left, sa_right = map(t.translate, expr.op().args)
ts = sa.text(f"TIMESTAMP '{sa_right.value}'")
return sa_left.op("-")(ts)


operation_registry.update(
{
ops.Round: _round,
ops.Log2: unary(sa.func.log2),
ops.Modulus: _mod,
ops.Log: _log,
ops.Translate: fixed_arity('replace', 3),
ops.DayOfWeekName: unary(sa.func.dayname),
ops.TimestampFromUNIX: _timestamp_from_unix,
# TODO(gil): this should work except sqlalchemy doesn't know how to
# render duckdb timestamps
# ops.TimestampDiff: fixed_arity('age', 2),
ops.TimestampDiff: _timestamp_diff,
}
)
Empty file.
11 changes: 11 additions & 0 deletions ibis/backends/duckdb/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from pathlib import Path

import ibis
from ibis.backends.tests.base import BackendTest, RoundAwayFromZero


class TestConf(BackendTest, RoundAwayFromZero):
@staticmethod
def connect(data_directory: Path):
path = data_directory / "ibis_testing.ddb"
return ibis.duckdb.connect(str(path)) # type: ignore
Loading

0 comments on commit 667f2d5

Please sign in to comment.