Skip to content

Commit

Permalink
improving duckdb + autopandas performance (#469)
Browse files Browse the repository at this point in the history
* Using native DuckDB `.df()` method when using `autopandas`

* lint

* test fixes

* adds polars to the benchmark

* adds missing unit tests for ResultSet

* ResultSet no longer a subclass of list

* fix

* ci

* ci

* ci

* fix

* checking result set with non sqlalchemy

* ci

* fixes notebook
  • Loading branch information
edublancas authored May 1, 2023
1 parent ca49215 commit ad625e3
Show file tree
Hide file tree
Showing 6 changed files with 445 additions and 50 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# CHANGELOG

## 0.7.5dev
* [Feature] Using native DuckDB `.df()` method when using `autopandas`

* [Doc] documenting `%sqlcmd tables`/`%sqlcmd columns`

Expand All @@ -14,6 +15,7 @@ Never deployed due to a CI error
* [Fix] Fix `--alias` when passing an existing engine
* [Doc] Tutorial on querying excel files with pandas and jupysql ([#423](https://github.com/ploomber/jupysql/pull/423))


## 0.7.2 (2023-04-25)

* [Feature] Support for DB API 2.0 drivers ([#350](https://github.com/ploomber/jupysql/issues/350))
Expand Down
248 changes: 248 additions & 0 deletions benchmarks/duckdb.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "d47c0ad1-d9da-476e-aa51-c0c52e764415",
"metadata": {
"user_expressions": []
},
"source": [
"# Native DuckDB vs JupySQL"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b53a574b",
"metadata": {},
"outputs": [],
"source": [
"%pip install pandas polars --quiet"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "1b217aea",
"metadata": {},
"source": [
"## Pandas\n",
"\n",
"### Native DuckDB"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "4efc1b4d-52ab-482c-a5aa-a237f82f72e4",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import duckdb\n",
"from pandas import DataFrame\n",
"import numpy as np\n",
"\n",
"num_rows = 1_000_000\n",
"\n",
"df = DataFrame(np.random.randn(num_rows, 20))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "9e5d8f89-9e29-4994-a83f-268befd65ea4",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"86.6 ms ± 1.02 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
]
}
],
"source": [
"with duckdb.connect() as con:\n",
" %timeit con.sql(\"select * from df\").df()"
]
},
{
"cell_type": "markdown",
"id": "892bc403-4f35-4489-889e-92eb76bda725",
"metadata": {
"user_expressions": []
},
"source": [
"### JupySQL"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8dbc3068-2dcf-4937-81f2-2cfbbefbb4e1",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"%load_ext sql\n",
"%sql duckdb://\n",
"%config SqlMagic.displaycon = False\n",
"%config SqlMagic.autopandas = True"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "7780ab33-56e1-44eb-84ce-887fdb96104a",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"90.1 ms ± 1.43 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"df_ = %sql select * from df"
]
},
{
"cell_type": "markdown",
"id": "1abcf8ab-0542-43fc-9cac-5ea9363e0827",
"metadata": {
"user_expressions": []
},
"source": [
"## Polars\n",
"\n",
"### Native DuckDB"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "2ad79efc-aa17-41b1-a34c-fbed807785d5",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import polars as pl\n",
"\n",
"df_pl = pl.DataFrame(np.random.randn(num_rows, 20))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "38fed499-28c0-4815-8c40-1e38f5c71e69",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"114 ms ± 701 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
]
}
],
"source": [
"with duckdb.connect() as con:\n",
" %timeit con.sql(\"select * from df_pl\").pl()"
]
},
{
"cell_type": "markdown",
"id": "e7fb905f-4a13-4ee4-8150-30ee079e4ec4",
"metadata": {
"user_expressions": []
},
"source": [
"### JupySQL"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "bf7ab190-9b21-44bd-9329-9aab5b39a8a3",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Disabled 'autopandas' since 'autopolars' was enabled.\n"
]
}
],
"source": [
"%config SqlMagic.autopolars = True"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "53a14724-ffad-461f-aeec-674a59b93a0b",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"Done.\n",
"Done.\n",
"Done.\n",
"Done.\n",
"Done.\n",
"Done.\n",
"Done.\n",
"3.93 s ± 37.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit\n",
"df_ = %sql select * from df_pl"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
45 changes: 23 additions & 22 deletions src/sql/connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,29 @@ class Connection:
# all connections
connections = {}

def __init__(self, engine, alias=None):
self.url = engine.url
self.name = self.assign_name(engine)
self.dialect = self.url.get_dialect()
self.engine = engine
self.session = engine.connect()

if IS_SQLALCHEMY_ONE:
self.metadata = sqlalchemy.MetaData(bind=engine)

self.connections[
alias
or (
repr(sqlalchemy.MetaData(bind=engine).bind.url)
if IS_SQLALCHEMY_ONE
else repr(engine.url)
)
] = self

self.connect_args = None
self.alias = alias
Connection.current = self

@classmethod
def _suggest_fix_no_module_found(module_name):
DEFAULT_PREFIX = "\n\n"
Expand Down Expand Up @@ -196,28 +219,6 @@ def _error_invalid_connection_info(cls, e, connect_str):
f"{e}.{cls._suggest_fix(env_var=False, connect_str=connect_str)}"
)

def __init__(self, engine, alias=None):
self.url = engine.url
self.name = self.assign_name(engine)
self.dialect = self.url.get_dialect()
self.session = engine.connect()

if IS_SQLALCHEMY_ONE:
self.metadata = sqlalchemy.MetaData(bind=engine)

self.connections[
alias
or (
repr(sqlalchemy.MetaData(bind=engine).bind.url)
if IS_SQLALCHEMY_ONE
else repr(engine.url)
)
] = self

self.connect_args = None
self.alias = alias
Connection.current = self

@classmethod
def from_connect_str(
cls, connect_str=None, connect_args=None, creator=None, alias=None
Expand Down
Loading

0 comments on commit ad625e3

Please sign in to comment.