From bb1bf165c08a1c8ca8b423edef1579fffe744ca4 Mon Sep 17 00:00:00 2001 From: Tony Kuo <123580782+tonykploomber@users.noreply.github.com> Date: Sun, 28 May 2023 00:36:11 +0800 Subject: [PATCH] --persist-replace argument (#440) --- CHANGELOG.md | 1 + doc/integrations/pandas.md | 30 +++++- src/sql/magic.py | 51 +++++++-- src/tests/test_command.py | 1 + src/tests/test_magic.py | 213 +++++++++++++++++++++++++++++++++++++ src/tests/test_parse.py | 1 + 6 files changed, 288 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ff888439a..2ec53e7ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## 0.7.6dev * [Feature] Support for printing capture variables using `=<<` syntax (by [@jorisroovers](https://github.com/jorisroovers)) +* [Feature] Adds `--persist-replace` argument to replace existing tables when persisting data frames (#440) ## 0.7.5 (2023-05-24) diff --git a/doc/integrations/pandas.md b/doc/integrations/pandas.md index 631769c74..c569f96cf 100644 --- a/doc/integrations/pandas.md +++ b/doc/integrations/pandas.md @@ -1,8 +1,8 @@ --- jupytext: - notebook_metadata_filter: myst cell_metadata_filter: -all formats: md:myst + notebook_metadata_filter: myst text_representation: extension: .md format_name: myst @@ -14,7 +14,8 @@ kernelspec: name: python3 myst: html_meta: - description lang=en: Convert outputs from SQL queries to pandas data frames using JupySQL + description lang=en: Convert outputs from SQL queries to pandas data frames using + JupySQL keywords: jupyter, sql, jupysql, pandas property=og:locale: en_US --- @@ -86,7 +87,9 @@ df +++ -The `--persist` argument, with the name of a DataFrame object in memory, +### `--persist` + +The `--persist` argument, with the name of a DataFrame object in memory, will create a table name in the database from the named DataFrame. Or use `--append` to add rows to an existing table by that name. 
```{code-cell} ipython3 @@ -97,6 +100,27 @@ will create a table name in the database from the named DataFrame. Or use `--a %sql SELECT * FROM df; ``` +### `--persist-replace` + +The `--persist-replace` argument performs similar functionality to `--persist`, +but it will drop the existing table before inserting the new table. + +#### Declare the dataframe again + +```{code-cell} ipython3 +df = %sql SELECT * FROM writer LIMIT 1 +df +``` + +#### Use `--persist-replace` + ```{code-cell} ipython3 +%sql --persist-replace df +``` +#### df table is overridden + +```{code-cell} ipython3 +%sql SELECT * FROM df; ``` + diff --git a/src/sql/magic.py b/src/sql/magic.py index 8543fa2c6..f32587f97 100644 --- a/src/sql/magic.py +++ b/src/sql/magic.py @@ -202,6 +202,12 @@ def _mutex_autopandas_autopolars(self, change): action="store_true", help="create a table name in the database from the named DataFrame", ) + @argument( + "-P", + "--persist-replace", + action="store_true", + help="replace the DataFrame if it exists, otherwise perform --persist", + ) @argument( "-n", "--no-index", @@ -367,11 +373,34 @@ def interactive_execute_wrapper(**kwargs): alias=args.alias, ) payload["connection_info"] = conn._get_curr_sqlalchemy_connection_info() - if args.persist: + if args.persist_replace and args.append: + raise exceptions.UsageError( + """You cannot simultaneously persist and append data to a dataframe; + please choose to utilize either one or the other.""" + ) + if args.persist and args.persist_replace: + warnings.warn("Please use either --persist or --persist-replace") + return self._persist_dataframe( + command.sql, + conn, + user_ns, + append=False, + index=not args.no_index, + replace=True, + ) + elif args.persist: return self._persist_dataframe( command.sql, conn, user_ns, append=False, index=not args.no_index ) - + elif args.persist_replace: + return self._persist_dataframe( + command.sql, + conn, + user_ns, + append=False, + index=not args.no_index, + replace=True, + ) if 
args.append: return self._persist_dataframe( command.sql, conn, user_ns, append=True, index=not args.no_index @@ -449,7 +478,9 @@ def interactive_execute_wrapper(**kwargs): legal_sql_identifier = re.compile(r"^[A-Za-z0-9#_$]+") @modify_exceptions - def _persist_dataframe(self, raw, conn, user_ns, append=False, index=True): + def _persist_dataframe( + self, raw, conn, user_ns, append=False, index=True, replace=False + ): """Implements PERSIST, which writes a DataFrame to the RDBMS""" if not DataFrame: raise exceptions.MissingPackageError( @@ -488,14 +519,22 @@ def _persist_dataframe(self, raw, conn, user_ns, append=False, index=True): table_name = frame_name.lower() table_name = self.legal_sql_identifier.search(table_name).group(0) - if_exists = "append" if append else "fail" + if replace: + if_exists = "replace" + elif append: + if_exists = "append" + else: + if_exists = "fail" try: frame.to_sql( table_name, conn.session.engine, if_exists=if_exists, index=index ) - except ValueError as e: - raise exceptions.ValueError(e) from e + except ValueError: + raise exceptions.ValueError( + f"""Table {table_name!r} already exists. 
Consider using \ +--persist-replace to drop the table before persisting the data frame""" + ) return "Persisted %s" % table_name diff --git a/src/tests/test_command.py b/src/tests/test_command.py index 8bd4cb878..7b73b14d5 100644 --- a/src/tests/test_command.py +++ b/src/tests/test_command.py @@ -162,6 +162,7 @@ def test_args(ip, sql_magic): "creator": None, "section": None, "persist": False, + "persist_replace": False, "no_index": False, "append": False, "connection_arguments": None, diff --git a/src/tests/test_magic.py b/src/tests/test_magic.py index 65dd0515c..37456ab4f 100644 --- a/src/tests/test_magic.py +++ b/src/tests/test_magic.py @@ -231,6 +231,219 @@ def test_persist_bare(ip): assert result.error_in_exec +def get_table_rows_as_dataframe(ip, table, name=None): + """The function will generate the pandas dataframe in the namespace + by querying the data by given table name""" + if name: + saved_df_name = name + else: + saved_df_name = f"df_{table}" + ip.run_cell(f"results = %sql SELECT * FROM {table} LIMIT 1;") + ip.run_cell(f"{saved_df_name} = results.DataFrame()") + return saved_df_name + + +@pytest.mark.parametrize( + "test_table, expected_result", + [ + ("test", [(0, 1, "foo")]), + ("author", [(0, "William", "Shakespeare", 1616)]), + ( + "website", + [ + ( + 0, + "Bertold Brecht", + "https://en.wikipedia.org/wiki/Bertolt_Brecht", + 1954, + ) + ], + ), + ("number_table", [(0, 4, -2)]), + ], +) +def test_persist_replace_abbr_no_override(ip, test_table, expected_result): + saved_df_name = get_table_rows_as_dataframe(ip, table=test_table) + ip.run_cell(f"%sql -P sqlite:// {saved_df_name}") + out = ip.run_cell(f"%sql SELECT * FROM {saved_df_name}") + assert out.result == expected_result + assert out.error_in_exec is None + + +@pytest.mark.parametrize( + "test_table, expected_result", + [ + ("test", [(0, 1, "foo")]), + ("author", [(0, "William", "Shakespeare", 1616)]), + ( + "website", + [ + ( + 0, + "Bertold Brecht", + 
"https://en.wikipedia.org/wiki/Bertolt_Brecht", + 1954, + ) + ], + ), + ("number_table", [(0, 4, -2)]), + ], +) +def test_persist_replace_no_override(ip, test_table, expected_result): + saved_df_name = get_table_rows_as_dataframe(ip, table=test_table) + ip.run_cell(f"%sql --persist-replace sqlite:// {saved_df_name}") + out = ip.run_cell(f"%sql SELECT * FROM {saved_df_name}") + assert out.result == expected_result + assert out.error_in_exec is None + + +@pytest.mark.parametrize( + "first_test_table, second_test_table, expected_result", + [ + ("test", "author", [(0, "William", "Shakespeare", 1616)]), + ("author", "test", [(0, 1, "foo")]), + ("test", "number_table", [(0, 4, -2)]), + ("number_table", "test", [(0, 1, "foo")]), + ], +) +def test_persist_replace_override( + ip, first_test_table, second_test_table, expected_result +): + saved_df_name = "dummy_df_name" + table_df = get_table_rows_as_dataframe( + ip, table=first_test_table, name=saved_df_name + ) + ip.run_cell(f"%sql --persist sqlite:// {table_df}") + table_df = get_table_rows_as_dataframe( + ip, table=second_test_table, name=saved_df_name + ) + # To test the second --persist-replace executes successfully + persist_replace_out = ip.run_cell(f"%sql --persist-replace sqlite:// {table_df}") + assert persist_replace_out.error_in_exec is None + + # To test the persisted data is from --persist + out = ip.run_cell(f"%sql SELECT * FROM {table_df}") + assert out.result == expected_result + assert out.error_in_exec is None + + +@pytest.mark.parametrize( + "first_test_table, second_test_table, expected_result", + [ + ("test", "author", [(0, 1, "foo")]), + ("author", "test", [(0, "William", "Shakespeare", 1616)]), + ("test", "number_table", [(0, 1, "foo")]), + ("number_table", "test", [(0, 4, -2)]), + ], +) +def test_persist_replace_override_reverted_order( + ip, first_test_table, second_test_table, expected_result +): + saved_df_name = "dummy_df_name" + table_df = get_table_rows_as_dataframe( + ip, 
table=first_test_table, name=saved_df_name + ) + ip.run_cell(f"%sql --persist-replace sqlite:// {table_df}") + table_df = get_table_rows_as_dataframe( + ip, table=second_test_table, name=saved_df_name + ) + persist_out = ip.run_cell(f"%sql --persist sqlite:// {table_df}") + + # To test the second --persist executes not successfully + assert ( + f"Table '{saved_df_name}' already exists. Consider using \ +--persist-replace to drop the table before persisting the data frame" + in str(persist_out.error_in_exec) + ) + + out = ip.run_cell(f"%sql SELECT * FROM {table_df}") + # To test the persisted data is from --persist-replace + assert out.result == expected_result + assert out.error_in_exec is None + + +@pytest.mark.parametrize( + "test_table", [("test"), ("author"), ("website"), ("number_table")] +) +def test_persist_and_append_use_together(ip, test_table): + # Test error message when use --persist and --append together + saved_df_name = get_table_rows_as_dataframe(ip, table=test_table) + out = ip.run_cell(f"%sql --persist-replace --append sqlite:// {saved_df_name}") + + assert """You cannot simultaneously persist and append data to a dataframe; + please choose to utilize either one or the other.""" in str( + out.error_in_exec + ) + assert (out.error_in_exec.error_type) == "UsageError" + + +@pytest.mark.parametrize( + "test_table, expected_result", + [ + ("test", [(0, 1, "foo")]), + ("author", [(0, "William", "Shakespeare", 1616)]), + ( + "website", + [ + ( + 0, + "Bertold Brecht", + "https://en.wikipedia.org/wiki/Bertolt_Brecht", + 1954, + ) + ], + ), + ("number_table", [(0, 4, -2)]), + ], +) +def test_persist_and_persist_replace_use_together( + ip, capsys, test_table, expected_result +): + # Test error message when use --persist and --persist-replace together + saved_df_name = get_table_rows_as_dataframe(ip, table=test_table) + # check UserWarning is raised + with pytest.warns(UserWarning) as w: + ip.run_cell(f"%sql --persist --persist-replace sqlite:// 
{saved_df_name}") + + # check that the message matches + assert w[0].message.args[0] == "Please use either --persist or --persist-replace" + + # Test persist-replace is used + execute_out = ip.run_cell(f"%sql SELECT * FROM {saved_df_name}") + assert execute_out.result == expected_result + assert execute_out.error_in_exec is None + + +@pytest.mark.parametrize( + "first_test_table, second_test_table, expected_result", + [ + ("test", "author", [(0, "William", "Shakespeare", 1616)]), + ("author", "test", [(0, 1, "foo")]), + ("test", "number_table", [(0, 4, -2)]), + ("number_table", "test", [(0, 1, "foo")]), + ], +) +def test_persist_replace_twice( + ip, first_test_table, second_test_table, expected_result +): + saved_df_name = "dummy_df_name" + + table_df = get_table_rows_as_dataframe( + ip, table=first_test_table, name=saved_df_name + ) + ip.run_cell(f"%sql --persist-replace sqlite:// {table_df}") + + table_df = get_table_rows_as_dataframe( + ip, table=second_test_table, name=saved_df_name + ) + ip.run_cell(f"%sql --persist-replace sqlite:// {table_df}") + + out = ip.run_cell(f"%sql SELECT * FROM {table_df}") + # To test the persisted data is from --persist-replace + assert out.result == expected_result + assert out.error_in_exec is None + + def test_connection_args_enforce_json(ip): result = ip.run_cell('%sql --connection_arguments {"badlyformed":true') assert result.error_in_exec diff --git a/src/tests/test_parse.py b/src/tests/test_parse.py index f124ae89e..137e40947 100644 --- a/src/tests/test_parse.py +++ b/src/tests/test_parse.py @@ -203,6 +203,7 @@ def complete_with_defaults(mapping): "creator": None, "section": None, "persist": False, + "persist_replace": False, "no_index": False, "append": False, "connection_arguments": None,