--persist-replace argument (#440)

ploomber · May 27, 2023 · bb1bf16 · bb1bf16
1 parent 0a88f46
commit bb1bf16
Show file tree

Hide file tree

Showing 6 changed files with 288 additions and 9 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,7 @@
 ## 0.7.6dev
 
 * [Feature] Support for printing capture variables using `=<<` syntax (by [@jorisroovers](https://github.com/jorisroovers))
+* [Feature] Adds `--persist-replace` argument to replace existing tables when persisting data frames (#440)
 
 ## 0.7.5 (2023-05-24)
 

diff --git a/doc/integrations/pandas.md b/doc/integrations/pandas.md
@@ -1,8 +1,8 @@
 ---
 jupytext:
-  notebook_metadata_filter: myst
   cell_metadata_filter: -all
   formats: md:myst
+  notebook_metadata_filter: myst
   text_representation:
     extension: .md
     format_name: myst
@@ -14,7 +14,8 @@ kernelspec:
   name: python3
 myst:
   html_meta:
-    description lang=en: Convert outputs from SQL queries to pandas data frames using JupySQL
+    description lang=en: Convert outputs from SQL queries to pandas data frames using
+      JupySQL
     keywords: jupyter, sql, jupysql, pandas
     property=og:locale: en_US
 ---
@@ -86,7 +87,9 @@ df
 
 +++
 
-The `--persist` argument, with the name of a  DataFrame object in memory, 
+### `--persist`
+
+The `--persist` argument, with the name of a DataFrame object in memory, 
 will create a table name in the database from the named DataFrame.   Or use `--append` to add rows to an existing  table by that name.
 
 ```{code-cell} ipython3
@@ -97,6 +100,27 @@ will create a table name in the database from the named DataFrame.   Or use `--a
 %sql SELECT * FROM df;
 ```
 
+### `--persist-replace`
+
+The `--persist-replace` argument performs similar functionality to `--persist`,
+but it drops the existing table before inserting the new one.
+
+#### Declare the dataframe again
+
+```{code-cell} ipython3
+df = %sql SELECT * FROM writer LIMIT 1
+df
+```
+
+#### Use `--persist-replace`
+
 ```{code-cell} ipython3
+%sql --persist-replace df
+```
 
+#### df table is overridden
+
+```{code-cell} ipython3
+%sql SELECT * FROM df;
 ```
+
diff --git a/src/sql/magic.py b/src/sql/magic.py
@@ -202,6 +202,12 @@ def _mutex_autopandas_autopolars(self, change):
         action="store_true",
         help="create a table name in the database from the named DataFrame",
     )
+    @argument(
+        "-P",
+        "--persist-replace",
+        action="store_true",
+        help="replace the DataFrame if it exists, otherwise perform --persist",
+    )
     @argument(
         "-n",
         "--no-index",
@@ -367,11 +373,34 @@ def interactive_execute_wrapper(**kwargs):
             alias=args.alias,
         )
         payload["connection_info"] = conn._get_curr_sqlalchemy_connection_info()
-        if args.persist:
+        if args.persist_replace and args.append:
+            raise exceptions.UsageError(
+                """You cannot simultaneously persist and append data to a dataframe;
+                  please choose to utilize either one or the other."""
+            )
+        if args.persist and args.persist_replace:
+            warnings.warn("Please use either --persist or --persist-replace")
+            return self._persist_dataframe(
+                command.sql,
+                conn,
+                user_ns,
+                append=False,
+                index=not args.no_index,
+                replace=True,
+            )
+        elif args.persist:
             return self._persist_dataframe(
                 command.sql, conn, user_ns, append=False, index=not args.no_index
             )
-
+        elif args.persist_replace:
+            return self._persist_dataframe(
+                command.sql,
+                conn,
+                user_ns,
+                append=False,
+                index=not args.no_index,
+                replace=True,
+            )
         if args.append:
             return self._persist_dataframe(
                 command.sql, conn, user_ns, append=True, index=not args.no_index
@@ -449,7 +478,9 @@ def interactive_execute_wrapper(**kwargs):
     legal_sql_identifier = re.compile(r"^[A-Za-z0-9#_$]+")
 
     @modify_exceptions
-    def _persist_dataframe(self, raw, conn, user_ns, append=False, index=True):
+    def _persist_dataframe(
+        self, raw, conn, user_ns, append=False, index=True, replace=False
+    ):
         """Implements PERSIST, which writes a DataFrame to the RDBMS"""
         if not DataFrame:
             raise exceptions.MissingPackageError(
@@ -488,14 +519,22 @@ def _persist_dataframe(self, raw, conn, user_ns, append=False, index=True):
         table_name = frame_name.lower()
         table_name = self.legal_sql_identifier.search(table_name).group(0)
 
-        if_exists = "append" if append else "fail"
+        if replace:
+            if_exists = "replace"
+        elif append:
+            if_exists = "append"
+        else:
+            if_exists = "fail"
 
         try:
             frame.to_sql(
                 table_name, conn.session.engine, if_exists=if_exists, index=index
             )
-        except ValueError as e:
-            raise exceptions.ValueError(e) from e
+        except ValueError:
+            raise exceptions.ValueError(
+                f"""Table {table_name!r} already exists. Consider using \
+--persist-replace to drop the table before persisting the data frame"""
+            )
 
         return "Persisted %s" % table_name
 

diff --git a/src/tests/test_command.py b/src/tests/test_command.py
@@ -162,6 +162,7 @@ def test_args(ip, sql_magic):
         "creator": None,
         "section": None,
         "persist": False,
+        "persist_replace": False,
         "no_index": False,
         "append": False,
         "connection_arguments": None,

diff --git a/src/tests/test_magic.py b/src/tests/test_magic.py
@@ -231,6 +231,219 @@ def test_persist_bare(ip):
     assert result.error_in_exec
 
 
+def get_table_rows_as_dataframe(ip, table, name=None):
+    """The function will generate the pandas dataframe in the namespace
+    by querying the data by given table name"""
+    if name:
+        saved_df_name = name
+    else:
+        saved_df_name = f"df_{table}"
+    ip.run_cell(f"results = %sql SELECT * FROM {table} LIMIT 1;")
+    ip.run_cell(f"{saved_df_name} = results.DataFrame()")
+    return saved_df_name
+
+
+@pytest.mark.parametrize(
+    "test_table, expected_result",
+    [
+        ("test", [(0, 1, "foo")]),
+        ("author", [(0, "William", "Shakespeare", 1616)]),
+        (
+            "website",
+            [
+                (
+                    0,
+                    "Bertold Brecht",
+                    "https://en.wikipedia.org/wiki/Bertolt_Brecht",
+                    1954,
+                )
+            ],
+        ),
+        ("number_table", [(0, 4, -2)]),
+    ],
+)
+def test_persist_replace_abbr_no_override(ip, test_table, expected_result):
+    saved_df_name = get_table_rows_as_dataframe(ip, table=test_table)
+    ip.run_cell(f"%sql -P sqlite:// {saved_df_name}")
+    out = ip.run_cell(f"%sql SELECT * FROM {saved_df_name}")
+    assert out.result == expected_result
+    assert out.error_in_exec is None
+
+
+@pytest.mark.parametrize(
+    "test_table, expected_result",
+    [
+        ("test", [(0, 1, "foo")]),
+        ("author", [(0, "William", "Shakespeare", 1616)]),
+        (
+            "website",
+            [
+                (
+                    0,
+                    "Bertold Brecht",
+                    "https://en.wikipedia.org/wiki/Bertolt_Brecht",
+                    1954,
+                )
+            ],
+        ),
+        ("number_table", [(0, 4, -2)]),
+    ],
+)
+def test_persist_replace_no_override(ip, test_table, expected_result):
+    saved_df_name = get_table_rows_as_dataframe(ip, table=test_table)
+    ip.run_cell(f"%sql --persist-replace sqlite:// {saved_df_name}")
+    out = ip.run_cell(f"%sql SELECT * FROM {saved_df_name}")
+    assert out.result == expected_result
+    assert out.error_in_exec is None
+
+
+@pytest.mark.parametrize(
+    "first_test_table, second_test_table, expected_result",
+    [
+        ("test", "author", [(0, "William", "Shakespeare", 1616)]),
+        ("author", "test", [(0, 1, "foo")]),
+        ("test", "number_table", [(0, 4, -2)]),
+        ("number_table", "test", [(0, 1, "foo")]),
+    ],
+)
+def test_persist_replace_override(
+    ip, first_test_table, second_test_table, expected_result
+):
+    saved_df_name = "dummy_df_name"
+    table_df = get_table_rows_as_dataframe(
+        ip, table=first_test_table, name=saved_df_name
+    )
+    ip.run_cell(f"%sql --persist sqlite:// {table_df}")
+    table_df = get_table_rows_as_dataframe(
+        ip, table=second_test_table, name=saved_df_name
+    )
+    # To test that --persist-replace succeeds when the table already exists
+    persist_replace_out = ip.run_cell(f"%sql --persist-replace sqlite:// {table_df}")
+    assert persist_replace_out.error_in_exec is None
+
+    # To test the persisted data is from --persist-replace (the second frame)
+    out = ip.run_cell(f"%sql SELECT * FROM {table_df}")
+    assert out.result == expected_result
+    assert out.error_in_exec is None
+
+
+@pytest.mark.parametrize(
+    "first_test_table, second_test_table, expected_result",
+    [
+        ("test", "author", [(0, 1, "foo")]),
+        ("author", "test", [(0, "William", "Shakespeare", 1616)]),
+        ("test", "number_table", [(0, 1, "foo")]),
+        ("number_table", "test", [(0, 4, -2)]),
+    ],
+)
+def test_persist_replace_override_reverted_order(
+    ip, first_test_table, second_test_table, expected_result
+):
+    saved_df_name = "dummy_df_name"
+    table_df = get_table_rows_as_dataframe(
+        ip, table=first_test_table, name=saved_df_name
+    )
+    ip.run_cell(f"%sql --persist-replace sqlite:// {table_df}")
+    table_df = get_table_rows_as_dataframe(
+        ip, table=second_test_table, name=saved_df_name
+    )
+    persist_out = ip.run_cell(f"%sql --persist sqlite:// {table_df}")
+
+    # To test that --persist fails because the table already exists
+    assert (
+        f"Table '{saved_df_name}' already exists. Consider using \
+--persist-replace to drop the table before persisting the data frame"
+        in str(persist_out.error_in_exec)
+    )
+
+    out = ip.run_cell(f"%sql SELECT * FROM {table_df}")
+    # To test the persisted data is from --persist-replace
+    assert out.result == expected_result
+    assert out.error_in_exec is None
+
+
+@pytest.mark.parametrize(
+    "test_table", [("test"), ("author"), ("website"), ("number_table")]
+)
+def test_persist_and_append_use_together(ip, test_table):
+    # Test error message when --persist-replace and --append are used together
+    saved_df_name = get_table_rows_as_dataframe(ip, table=test_table)
+    out = ip.run_cell(f"%sql --persist-replace --append sqlite:// {saved_df_name}")
+
+    assert """You cannot simultaneously persist and append data to a dataframe;
+                  please choose to utilize either one or the other.""" in str(
+        out.error_in_exec
+    )
+    assert (out.error_in_exec.error_type) == "UsageError"
+
+
+@pytest.mark.parametrize(
+    "test_table, expected_result",
+    [
+        ("test", [(0, 1, "foo")]),
+        ("author", [(0, "William", "Shakespeare", 1616)]),
+        (
+            "website",
+            [
+                (
+                    0,
+                    "Bertold Brecht",
+                    "https://en.wikipedia.org/wiki/Bertolt_Brecht",
+                    1954,
+                )
+            ],
+        ),
+        ("number_table", [(0, 4, -2)]),
+    ],
+)
+def test_persist_and_persist_replace_use_together(
+    ip, capsys, test_table, expected_result
+):
+    # Test the warning when --persist and --persist-replace are used together
+    saved_df_name = get_table_rows_as_dataframe(ip, table=test_table)
+    # check UserWarning is raised
+    with pytest.warns(UserWarning) as w:
+        ip.run_cell(f"%sql --persist --persist-replace sqlite:// {saved_df_name}")
+
+    # check that the message matches
+    assert w[0].message.args[0] == "Please use either --persist or --persist-replace"
+
+    # Test persist-replace is used
+    execute_out = ip.run_cell(f"%sql SELECT * FROM {saved_df_name}")
+    assert execute_out.result == expected_result
+    assert execute_out.error_in_exec is None
+
+
+@pytest.mark.parametrize(
+    "first_test_table, second_test_table, expected_result",
+    [
+        ("test", "author", [(0, "William", "Shakespeare", 1616)]),
+        ("author", "test", [(0, 1, "foo")]),
+        ("test", "number_table", [(0, 4, -2)]),
+        ("number_table", "test", [(0, 1, "foo")]),
+    ],
+)
+def test_persist_replace_twice(
+    ip, first_test_table, second_test_table, expected_result
+):
+    saved_df_name = "dummy_df_name"
+
+    table_df = get_table_rows_as_dataframe(
+        ip, table=first_test_table, name=saved_df_name
+    )
+    ip.run_cell(f"%sql --persist-replace sqlite:// {table_df}")
+
+    table_df = get_table_rows_as_dataframe(
+        ip, table=second_test_table, name=saved_df_name
+    )
+    ip.run_cell(f"%sql --persist-replace sqlite:// {table_df}")
+
+    out = ip.run_cell(f"%sql SELECT * FROM {table_df}")
+    # To test the persisted data is from the second --persist-replace
+    assert out.result == expected_result
+    assert out.error_in_exec is None
+
+
 def test_connection_args_enforce_json(ip):
     result = ip.run_cell('%sql --connection_arguments {"badlyformed":true')
     assert result.error_in_exec

diff --git a/src/tests/test_parse.py b/src/tests/test_parse.py
@@ -203,6 +203,7 @@ def complete_with_defaults(mapping):
         "creator": None,
         "section": None,
         "persist": False,
+        "persist_replace": False,
         "no_index": False,
         "append": False,
         "connection_arguments": None,