Skip to content

Commit

Permalink
--persist-replace argument (#440)
Browse files Browse the repository at this point in the history
  • Loading branch information
tonykploomber authored and edublancas committed May 27, 2023
1 parent 0a88f46 commit bb1bf16
Show file tree
Hide file tree
Showing 6 changed files with 288 additions and 9 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## 0.7.6dev

* [Feature] Support for printing capture variables using `=<<` syntax (by [@jorisroovers](https://github.com/jorisroovers))
* [Feature] Adds `--persist-replace` argument to replace existing tables when persisting data frames (#440)

## 0.7.5 (2023-05-24)

Expand Down
30 changes: 27 additions & 3 deletions doc/integrations/pandas.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
---
jupytext:
notebook_metadata_filter: myst
cell_metadata_filter: -all
formats: md:myst
notebook_metadata_filter: myst
text_representation:
extension: .md
format_name: myst
Expand All @@ -14,7 +14,8 @@ kernelspec:
name: python3
myst:
html_meta:
description lang=en: Convert outputs from SQL queries to pandas data frames using JupySQL
description lang=en: Convert outputs from SQL queries to pandas data frames using
JupySQL
keywords: jupyter, sql, jupysql, pandas
property=og:locale: en_US
---
Expand Down Expand Up @@ -86,7 +87,9 @@ df

+++

The `--persist` argument, with the name of a DataFrame object in memory,
### `--persist`

The `--persist` argument, with the name of a DataFrame object in memory,
will create a table in the database from the named DataFrame. Alternatively, use `--append` to add rows to an existing table by that name.

```{code-cell} ipython3
Expand All @@ -97,6 +100,27 @@ will create a table name in the database from the named DataFrame. Or use `--a
%sql SELECT * FROM df;
```

### `--persist-replace`

The `--persist-replace` argument performs similar functionality to `--persist`,
but it drops the existing table before inserting the new one.

#### Declare the dataframe again

```{code-cell} ipython3
df = %sql SELECT * FROM writer LIMIT 1
df
```

#### Use `--persist-replace`

```{code-cell} ipython3
%sql --persist-replace df
```

#### df table is overridden

```{code-cell} ipython3
%sql SELECT * FROM df;
```

51 changes: 45 additions & 6 deletions src/sql/magic.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,12 @@ def _mutex_autopandas_autopolars(self, change):
action="store_true",
help="create a table name in the database from the named DataFrame",
)
@argument(
"-P",
"--persist-replace",
action="store_true",
help="replace the DataFrame if it exists, otherwise perform --persist",
)
@argument(
"-n",
"--no-index",
Expand Down Expand Up @@ -367,11 +373,34 @@ def interactive_execute_wrapper(**kwargs):
alias=args.alias,
)
payload["connection_info"] = conn._get_curr_sqlalchemy_connection_info()
if args.persist:
if args.persist_replace and args.append:
raise exceptions.UsageError(
"""You cannot simultaneously persist and append data to a dataframe;
please choose to utilize either one or the other."""
)
if args.persist and args.persist_replace:
warnings.warn("Please use either --persist or --persist-replace")
return self._persist_dataframe(
command.sql,
conn,
user_ns,
append=False,
index=not args.no_index,
replace=True,
)
elif args.persist:
return self._persist_dataframe(
command.sql, conn, user_ns, append=False, index=not args.no_index
)

elif args.persist_replace:
return self._persist_dataframe(
command.sql,
conn,
user_ns,
append=False,
index=not args.no_index,
replace=True,
)
if args.append:
return self._persist_dataframe(
command.sql, conn, user_ns, append=True, index=not args.no_index
Expand Down Expand Up @@ -449,7 +478,9 @@ def interactive_execute_wrapper(**kwargs):
legal_sql_identifier = re.compile(r"^[A-Za-z0-9#_$]+")

@modify_exceptions
def _persist_dataframe(self, raw, conn, user_ns, append=False, index=True):
def _persist_dataframe(
self, raw, conn, user_ns, append=False, index=True, replace=False
):
"""Implements PERSIST, which writes a DataFrame to the RDBMS"""
if not DataFrame:
raise exceptions.MissingPackageError(
Expand Down Expand Up @@ -488,14 +519,22 @@ def _persist_dataframe(self, raw, conn, user_ns, append=False, index=True):
table_name = frame_name.lower()
table_name = self.legal_sql_identifier.search(table_name).group(0)

if_exists = "append" if append else "fail"
if replace:
if_exists = "replace"
elif append:
if_exists = "append"
else:
if_exists = "fail"

try:
frame.to_sql(
table_name, conn.session.engine, if_exists=if_exists, index=index
)
except ValueError as e:
raise exceptions.ValueError(e) from e
except ValueError:
raise exceptions.ValueError(
f"""Table {table_name!r} already exists. Consider using \
--persist-replace to drop the table before persisting the data frame"""
)

return "Persisted %s" % table_name

Expand Down
1 change: 1 addition & 0 deletions src/tests/test_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ def test_args(ip, sql_magic):
"creator": None,
"section": None,
"persist": False,
"persist_replace": False,
"no_index": False,
"append": False,
"connection_arguments": None,
Expand Down
213 changes: 213 additions & 0 deletions src/tests/test_magic.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,219 @@ def test_persist_bare(ip):
assert result.error_in_exec


def get_table_rows_as_dataframe(ip, table, name=None):
    """Create a DataFrame variable in the shell namespace from one table row.

    Two cells are run through ``ip.run_cell``: the first stores the result of
    ``SELECT * FROM <table> LIMIT 1`` in ``results``; the second assigns
    ``results.DataFrame()`` to the target variable.

    Args:
        ip: Object exposing ``run_cell`` (the shell used by the tests).
        table: Name of the table to query.
        name: Variable name to bind the DataFrame to; when falsy,
            ``df_<table>`` is used instead.

    Returns:
        The name of the variable the DataFrame was bound to.
    """
    target = name or f"df_{table}"
    cells = (
        f"results = %sql SELECT * FROM {table} LIMIT 1;",
        f"{target} = results.DataFrame()",
    )
    for cell in cells:
        ip.run_cell(cell)
    return target


@pytest.mark.parametrize(
    "test_table, expected_result",
    [
        ("test", [(0, 1, "foo")]),
        ("author", [(0, "William", "Shakespeare", 1616)]),
        (
            "website",
            [
                (
                    0,
                    "Bertold Brecht",
                    "https://en.wikipedia.org/wiki/Bertolt_Brecht",
                    1954,
                )
            ],
        ),
        ("number_table", [(0, 4, -2)]),
    ],
)
def test_persist_replace_abbr_no_override(ip, test_table, expected_result):
    """The short flag ``-P`` persists a DataFrame that can be queried back."""
    frame_name = get_table_rows_as_dataframe(ip, table=test_table)

    # Persist through the abbreviated form of --persist-replace
    ip.run_cell(f"%sql -P sqlite:// {frame_name}")

    # Reading the persisted table back must succeed and return the row
    query_out = ip.run_cell(f"%sql SELECT * FROM {frame_name}")
    assert query_out.error_in_exec is None
    assert query_out.result == expected_result


@pytest.mark.parametrize(
    "test_table, expected_result",
    [
        ("test", [(0, 1, "foo")]),
        ("author", [(0, "William", "Shakespeare", 1616)]),
        (
            "website",
            [
                (
                    0,
                    "Bertold Brecht",
                    "https://en.wikipedia.org/wiki/Bertolt_Brecht",
                    1954,
                )
            ],
        ),
        ("number_table", [(0, 4, -2)]),
    ],
)
def test_persist_replace_no_override(ip, test_table, expected_result):
    """``--persist-replace`` persists a DataFrame that can be queried back."""
    frame_name = get_table_rows_as_dataframe(ip, table=test_table)

    # Persist through the long form of the flag
    ip.run_cell(f"%sql --persist-replace sqlite:// {frame_name}")

    # Reading the persisted table back must succeed and return the row
    query_out = ip.run_cell(f"%sql SELECT * FROM {frame_name}")
    assert query_out.error_in_exec is None
    assert query_out.result == expected_result


@pytest.mark.parametrize(
    "first_test_table, second_test_table, expected_result",
    [
        ("test", "author", [(0, "William", "Shakespeare", 1616)]),
        ("author", "test", [(0, 1, "foo")]),
        ("test", "number_table", [(0, 4, -2)]),
        ("number_table", "test", [(0, 1, "foo")]),
    ],
)
def test_persist_replace_override(
    ip, first_test_table, second_test_table, expected_result
):
    """``--persist-replace`` overwrites data persisted earlier with ``--persist``.

    The same variable name is first bound to a row of ``first_test_table`` and
    persisted with ``--persist``; it is then rebound to a row of
    ``second_test_table`` and persisted with ``--persist-replace``. The table
    must end up holding the second row (``expected_result``).
    """
    saved_df_name = "dummy_df_name"
    table_df = get_table_rows_as_dataframe(
        ip, table=first_test_table, name=saved_df_name
    )
    ip.run_cell(f"%sql --persist sqlite:// {table_df}")

    # Rebind the same variable name to a row taken from the second table
    table_df = get_table_rows_as_dataframe(
        ip, table=second_test_table, name=saved_df_name
    )
    # The --persist-replace call must execute successfully even though the
    # same table name was persisted just above
    persist_replace_out = ip.run_cell(f"%sql --persist-replace sqlite:// {table_df}")
    assert persist_replace_out.error_in_exec is None

    # The stored row must be the one written by --persist-replace (the second
    # table's row), not the one written by the initial --persist
    out = ip.run_cell(f"%sql SELECT * FROM {table_df}")
    assert out.result == expected_result
    assert out.error_in_exec is None


@pytest.mark.parametrize(
    "first_test_table, second_test_table, expected_result",
    [
        ("test", "author", [(0, 1, "foo")]),
        ("author", "test", [(0, "William", "Shakespeare", 1616)]),
        ("test", "number_table", [(0, 1, "foo")]),
        ("number_table", "test", [(0, 4, -2)]),
    ],
)
def test_persist_replace_override_reverted_order(
    ip, first_test_table, second_test_table, expected_result
):
    """``--persist`` fails on a table already written by ``--persist-replace``.

    The variable is first persisted with ``--persist-replace`` (row of
    ``first_test_table``), then rebound to a row of ``second_test_table`` and
    persisted with plain ``--persist``. The second call must report that the
    table already exists, and the table must still hold the first row
    (``expected_result``).
    """
    saved_df_name = "dummy_df_name"
    table_df = get_table_rows_as_dataframe(
        ip, table=first_test_table, name=saved_df_name
    )
    ip.run_cell(f"%sql --persist-replace sqlite:// {table_df}")
    table_df = get_table_rows_as_dataframe(
        ip, table=second_test_table, name=saved_df_name
    )
    persist_out = ip.run_cell(f"%sql --persist sqlite:// {table_df}")

    # The plain --persist must not succeed. The message is checked as two
    # fragments so the assertion does not depend on how the multi-line
    # message literal is wrapped or indented in the source.
    error_text = str(persist_out.error_in_exec)
    assert f"Table '{saved_df_name}' already exists. Consider using" in error_text
    assert (
        "--persist-replace to drop the table before persisting the data frame"
        in error_text
    )

    out = ip.run_cell(f"%sql SELECT * FROM {table_df}")
    # The persisted data must still be the row written by --persist-replace
    assert out.result == expected_result
    assert out.error_in_exec is None


@pytest.mark.parametrize("test_table", ["test", "author", "website", "number_table"])
def test_persist_and_append_use_together(ip, test_table):
    """Combining ``--persist-replace`` with ``--append`` yields a ``UsageError``."""
    # NOTE: despite the test name, the flags combined here are
    # --persist-replace and --append
    saved_df_name = get_table_rows_as_dataframe(ip, table=test_table)
    out = ip.run_cell(f"%sql --persist-replace --append sqlite:// {saved_df_name}")

    # The message spans two lines; each line is checked on its own so the
    # assertion does not depend on the whitespace around the line break.
    error_text = str(out.error_in_exec)
    assert (
        "You cannot simultaneously persist and append data to a dataframe;"
        in error_text
    )
    assert "please choose to utilize either one or the other." in error_text
    assert out.error_in_exec.error_type == "UsageError"


@pytest.mark.parametrize(
    "test_table, expected_result",
    [
        ("test", [(0, 1, "foo")]),
        ("author", [(0, "William", "Shakespeare", 1616)]),
        (
            "website",
            [
                (
                    0,
                    "Bertold Brecht",
                    "https://en.wikipedia.org/wiki/Bertolt_Brecht",
                    1954,
                )
            ],
        ),
        ("number_table", [(0, 4, -2)]),
    ],
)
def test_persist_and_persist_replace_use_together(
    ip, capsys, test_table, expected_result
):
    """Passing ``--persist`` and ``--persist-replace`` together warns and persists.

    A ``UserWarning`` asking to pick one of the two flags must be emitted, and
    the DataFrame must still be persisted so it can be queried back.
    ``capsys`` is not used by the body; it is kept so the signature is
    unchanged.
    """
    saved_df_name = get_table_rows_as_dataframe(ip, table=test_table)

    # Record every warning raised while the cell runs
    with pytest.warns(UserWarning) as recorded:
        ip.run_cell(f"%sql --persist --persist-replace sqlite:// {saved_df_name}")

    # Search all recorded warnings rather than only the first one, so an
    # unrelated warning emitted earlier in the cell cannot mask the expected one
    messages = [str(record.message) for record in recorded]
    assert "Please use either --persist or --persist-replace" in messages

    # The data frame must have been persisted despite the warning
    execute_out = ip.run_cell(f"%sql SELECT * FROM {saved_df_name}")
    assert execute_out.result == expected_result
    assert execute_out.error_in_exec is None


@pytest.mark.parametrize(
    "first_test_table, second_test_table, expected_result",
    [
        ("test", "author", [(0, "William", "Shakespeare", 1616)]),
        ("author", "test", [(0, 1, "foo")]),
        ("test", "number_table", [(0, 4, -2)]),
        ("number_table", "test", [(0, 1, "foo")]),
    ],
)
def test_persist_replace_twice(
    ip, first_test_table, second_test_table, expected_result
):
    """Running ``--persist-replace`` twice keeps only the second DataFrame."""
    frame_name = "dummy_df_name"

    # Bind the same variable to each table's row in turn and persist it with
    # --persist-replace both times
    for source_table in (first_test_table, second_test_table):
        bound_name = get_table_rows_as_dataframe(
            ip, table=source_table, name=frame_name
        )
        ip.run_cell(f"%sql --persist-replace sqlite:// {bound_name}")

    # The table must hold the row written by the last --persist-replace
    query_out = ip.run_cell(f"%sql SELECT * FROM {bound_name}")
    assert query_out.result == expected_result
    assert query_out.error_in_exec is None


def test_connection_args_enforce_json(ip):
result = ip.run_cell('%sql --connection_arguments {"badlyformed":true')
assert result.error_in_exec
Expand Down
1 change: 1 addition & 0 deletions src/tests/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ def complete_with_defaults(mapping):
"creator": None,
"section": None,
"persist": False,
"persist_replace": False,
"no_index": False,
"append": False,
"connection_arguments": None,
Expand Down

0 comments on commit bb1bf16

Please sign in to comment.