From 3a8c7ccb39a48a567be71c78c23afc8261bb8fb1 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Wed, 6 Mar 2024 08:18:35 -0600 Subject: [PATCH] docs(snowflake): add blog post showing insertion into snowflake from postgres (#8426) --- .../index/execute-results/html.json | 16 ++ docs/posts/into-snowflake/index.qmd | 143 ++++++++++++++++++ 2 files changed, 159 insertions(+) create mode 100644 docs/_freeze/posts/into-snowflake/index/execute-results/html.json create mode 100644 docs/posts/into-snowflake/index.qmd diff --git a/docs/_freeze/posts/into-snowflake/index/execute-results/html.json b/docs/_freeze/posts/into-snowflake/index/execute-results/html.json new file mode 100644 index 000000000000..26ee27a86963 --- /dev/null +++ b/docs/_freeze/posts/into-snowflake/index/execute-results/html.json @@ -0,0 +1,16 @@ +{ + "hash": "b777fdee8d50ae460617f4078c6145bf", + "result": { + "engine": "jupyter", + "markdown": "---\ntitle: \"Snow IO: loading data from other DBs into Snowflake\"\nauthor: \"Phillip Cloud\"\nerror: false\ndate: \"2024-03-06\"\ncategories:\n - blog\n - snowflake\n - io\n - productivity\n---\n\n## Recap\n\nWe've [blogged about Snowflake IO before](../snowflake-io/index.qmd), in the\ncontext of getting local files into Snowflake as fast as possible.\n\nIn this post, we'll show how to insert query results from another system into\nSnowflake, using Ibis.\n\n## Setup\n\n### Connect to your non-Snowflake system\n\nWe'll connect to a postgres database running locally in a container. You\nshould be able to swap in your own connection details as needed.\n\n::: {#52dc2246 .cell execution_count=1}\n``` {.python .cell-code}\nfrom ibis.interactive import * # <1>\n\npg_con = ibis.connect(\"postgres://postgres:postgres@localhost/postgres\")\n```\n:::\n\n\n1. Import Ibis for maximum productivity in interactive analysis.\n\nWe'll use a test dataset that contains some baseball batting statistics.\n\nIbis provides that example data, so we can dump that into postgres.\n\n::: {#c9ed5f4c .cell execution_count=2}\n``` {.python .cell-code}\npg_batting = pg_con.create_table(\n \"batting\",\n ibis.examples.Batting.fetch().to_pandas(), # <1>\n temp=True, # <2>\n)\n```\n:::\n\n\n1. Yep, I'm using pandas here!\n2. Use a temporary table to avoid cluttering up the database.\n\n### Connect to Snowflake\n\n::: {#5f332c9a .cell execution_count=3}\n``` {.python .cell-code}\nimport os\n\n# snowflake://user:pass@account/database/schema?warehouse=my_warehouse\nsnow_con = ibis.connect(os.environ[\"SNOWFLAKE_URL\"]) # <1>\n```\n:::\n\n\n1. Set the `SNOWFLAKE_URL` environment variable to your Snowflake connection string.\n\n## Profit\n\n### Construct an Ibis expression from the postgres data\n\nLet's build an Ibis expression based on the `batting` table in our postgres database.\n\n::: {#842f6246 .cell execution_count=4}\n``` {.python .cell-code}\npg_batting\n```\n\n::: {.cell-output .cell-output-display execution_count=5}\n```{=html}\n
┏━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓\n┃ player_id ┃ year_id ┃ stint ┃ team_id ┃ lg_id ┃ g ┃ ab ┃ r ┃ h ┃ x2b ┃ x3b ┃ hr ┃ rbi ┃ sb ┃ cs ┃ bb ┃ so ┃ ibb ┃ hbp ┃ sh ┃ sf ┃ gidp ┃\n┡━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩\n│ string │ int64 │ int64 │ string │ string │ int64 │ int64 │ int64 │ int64 │ int64 │ int64 │ int64 │ float64 │ float64 │ float64 │ int64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │\n├───────────┼─────────┼───────┼─────────┼────────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼─────────┼─────────┼─────────┼───────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┤\n│ abercda01 │ 1871 │ 1 │ TRO │ NA │ 1 │ 4 │ 0 │ 0 │ 0 │ 0 │ 0 │ 0.0 │ 0.0 │ 0.0 │ 0 │ 0.0 │ nan │ nan │ nan │ nan │ 0.0 │\n│ addybo01 │ 1871 │ 1 │ RC1 │ NA │ 25 │ 118 │ 30 │ 32 │ 6 │ 0 │ 0 │ 13.0 │ 8.0 │ 1.0 │ 4 │ 0.0 │ nan │ nan │ nan │ nan │ 0.0 │\n│ allisar01 │ 1871 │ 1 │ CL1 │ NA │ 29 │ 137 │ 28 │ 40 │ 4 │ 5 │ 0 │ 19.0 │ 3.0 │ 1.0 │ 2 │ 5.0 │ nan │ nan │ nan │ nan │ 1.0 │\n│ allisdo01 │ 1871 │ 1 │ WS3 │ NA │ 27 │ 133 │ 28 │ 44 │ 10 │ 2 │ 2 │ 27.0 │ 1.0 │ 1.0 │ 0 │ 2.0 │ nan │ nan │ nan │ nan │ 0.0 │\n│ ansonca01 │ 1871 │ 1 │ RC1 │ NA │ 25 │ 120 │ 29 │ 39 │ 11 │ 3 │ 0 │ 16.0 │ 6.0 │ 2.0 │ 2 │ 1.0 │ nan │ nan │ nan │ nan │ 0.0 │\n│ armstbo01 │ 1871 │ 1 │ FW1 │ NA │ 12 │ 49 │ 9 │ 11 │ 2 │ 1 │ 0 │ 5.0 │ 0.0 │ 1.0 │ 0 │ 1.0 │ nan │ nan │ nan │ nan │ 0.0 │\n│ barkeal01 │ 1871 │ 1 │ RC1 │ NA │ 1 │ 4 │ 0 │ 1 │ 0 │ 0 │ 0 │ 2.0 │ 0.0 │ 0.0 │ 1 │ 0.0 │ nan │ nan │ nan │ nan │ 0.0 │\n│ barnero01 │ 1871 │ 1 │ BS1 │ NA │ 31 │ 157 │ 66 │ 63 │ 10 │ 9 │ 0 │ 34.0 │ 11.0 │ 6.0 │ 13 │ 1.0 │ nan │ nan │ nan │ nan │ 1.0 │\n│ barrebi01 │ 1871 │ 1 │ FW1 │ NA │ 1 │ 5 │ 1 │ 1 │ 1 │ 0 │ 0 │ 1.0 │ 0.0 │ 0.0 │ 0 │ 0.0 │ nan │ nan │ nan │ nan │ 0.0 │\n│ barrofr01 │ 1871 │ 1 │ BS1 │ NA │ 18 │ 86 │ 13 │ 13 │ 2 │ 1 │ 0 │ 11.0 │ 1.0 │ 0.0 │ 0 │ 0.0 │ nan │ nan │ nan │ nan │ 0.0 │\n│ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │\n└───────────┴─────────┴───────┴─────────┴────────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴─────────┴─────────┴─────────┴───────┴─────────┴─────────┴─────────┴─────────┴─────────┴─────────┘\n\n```\n:::\n:::\n\n\nWe can compute the average [RBI](https://en.wikipedia.org/wiki/Run_batted_in) per year per team.\n\n::: {#0fe95f00 .cell execution_count=5}\n``` {.python .cell-code}\npg_expr = pg_batting.group_by((\"year_id\", \"team_id\")).agg(avg_rbi=_.rbi.mean())\npg_expr\n```\n\n::: {.cell-output .cell-output-display execution_count=6}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━┓\n┃ year_id ┃ team_id ┃ avg_rbi ┃\n┡━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━┩\n│ int64 │ string │ float64 │\n├─────────┼─────────┼───────────┤\n│ 1891 │ PIT │ 22.782609 │\n│ 1895 │ BSN │ 34.363636 │\n│ 1940 │ SLA │ 22.343750 │\n│ 1981 │ HOU │ 9.972973 │\n│ 1913 │ CLE │ 13.512821 │\n│ 1971 │ MON │ 17.181818 │\n│ 2008 │ PIT │ 15.000000 │\n│ 1895 │ WAS │ 23.096774 │\n│ 2011 │ KCA │ 16.785714 │\n│ 2007 │ MIL │ 19.350000 │\n│ … │ … │ … │\n└─────────┴─────────┴───────────┘\n\n```\n:::\n:::\n\n\nWe can also rename columns to be more consistent with typical Snowflake usage.\n\n::: {#c75c8ff3 .cell execution_count=6}\n``` {.python .cell-code}\npg_expr = pg_expr.rename(\"ALL_CAPS\")\npg_expr\n```\n\n::: {.cell-output .cell-output-display execution_count=7}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━┓\n┃ YEAR_ID ┃ TEAM_ID ┃ AVG_RBI ┃\n┡━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━┩\n│ int64 │ string │ float64 │\n├─────────┼─────────┼───────────┤\n│ 1891 │ PIT │ 22.782609 │\n│ 1895 │ BSN │ 34.363636 │\n│ 1940 │ SLA │ 22.343750 │\n│ 1981 │ HOU │ 9.972973 │\n│ 1913 │ CLE │ 13.512821 │\n│ 1971 │ MON │ 17.181818 │\n│ 2008 │ PIT │ 15.000000 │\n│ 1895 │ WAS │ 23.096774 │\n│ 2011 │ KCA │ 16.785714 │\n│ 2007 │ MIL │ 19.350000 │\n│ … │ … │ … │\n└─────────┴─────────┴───────────┘\n\n```\n:::\n:::\n\n\nLet's show how many rows we have in the result.\n\n::: {#ac4befe6 .cell execution_count=7}\n``` {.python .cell-code}\npg_expr.count()\n```\n\n::: {.cell-output .cell-output-display}\n```{=html}\n\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=8}\n\n::: {.ansi-escaped-output}\n```{=html}\n
3015
\n```\n:::\n\n:::\n:::\n\n\n### Insert the computed results into Snowflake\n\nBecause all Ibis backends implement the `to_pyarrow()` method, we can\nget data out of another system and into Snowflake with a few lines of code.\n\nFirst we'll create a table in Snowflake to hold the data.\n\nIbis helps here by providing an API to access the schema from the\n**postgres**-based expression, and automatically translates postgres types into\nSnowflake types.\n\n::: {#22568a53 .cell execution_count=8}\n``` {.python .cell-code}\nsnow_table = snow_con.create_table(\"pg_batting\", schema=pg_expr.schema(), temp=True) # <1>\n```\n:::\n\n\n1. By default the table will be created in the database and schema of the\n current connection.\n\n We create a temporary table for the same reason we do with postgres above.\n\n\nWe'll show that the table is empty to sanity check ourselves.\n\n::: {#dc8f71fa .cell execution_count=9}\n``` {.python .cell-code}\nsnow_table\n```\n\n::: {.cell-output .cell-output-display execution_count=10}\n```{=html}\n┏━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓\n┃ YEAR_ID ┃ TEAM_ID ┃ AVG_RBI ┃\n┡━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩\n│ int64 │ string │ float64 │\n└─────────┴─────────┴─────────┘\n\n```\n:::\n:::\n\n\nInsert the expression's result table into Snowflake.\n\n::: {#01bcdc29 .cell execution_count=10}\n``` {.python .cell-code}\nsnow_con.insert(\"pg_batting\", pg_expr.to_pyarrow())\n```\n:::\n\n\nTo sanity check what we've done let's peek at the table.\n\n::: {#e7a29528 .cell execution_count=11}\n``` {.python .cell-code}\nsnow_table\n```\n\n::: {.cell-output .cell-output-display execution_count=12}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━┓\n┃ YEAR_ID ┃ TEAM_ID ┃ AVG_RBI ┃\n┡━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━┩\n│ int64 │ string │ float64 │\n├─────────┼─────────┼───────────┤\n│ 1891 │ PIT │ 22.782609 │\n│ 1895 │ BSN │ 34.363636 │\n│ 1940 │ SLA │ 22.343750 │\n│ 1981 │ HOU │ 9.972973 │\n│ 1913 │ CLE │ 13.512821 │\n│ 1971 │ MON │ 17.181818 │\n│ 2008 │ PIT │ 15.000000 │\n│ 1895 │ WAS │ 23.096774 │\n│ 2011 │ KCA │ 16.785714 │\n│ 2007 │ MIL │ 19.350000 │\n│ … │ … │ … │\n└─────────┴─────────┴───────────┘\n\n```\n:::\n:::\n\n\nWe'll count them too, to be extra sure.\n\n::: {#0b854a6c .cell execution_count=12}\n``` {.python .cell-code}\nsnow_table.count()\n```\n\n::: {.cell-output .cell-output-display}\n```{=html}\n\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=13}\n\n::: {.ansi-escaped-output}\n```{=html}\n
3015
\n```\n:::\n\n:::\n:::\n\n\n## Conclusion\n\nIn this post we show how easy it is to move data from one backend into Snowflake using Ibis.\n\nPlease try it out and get in touch on [Zulip](https://ibis-project.zulipchat.com/) or\n[GitHub](https://github.com/ibis-project/ibis), we'd love to hear from you!\n\n",
+ "supporting": [
+ "index_files"
+ ],
+ "filters": [],
+ "includes": {
+ "include-in-header": [
+ "\n\n\n"
+ ]
+ }
+ }
+}
\ No newline at end of file
diff --git a/docs/posts/into-snowflake/index.qmd b/docs/posts/into-snowflake/index.qmd
new file mode 100644
index 000000000000..2fe6a6b85943
--- /dev/null
+++ b/docs/posts/into-snowflake/index.qmd
@@ -0,0 +1,143 @@
+---
+title: "Snow IO: loading data from other DBs into Snowflake"
+author: "Phillip Cloud"
+error: false
+date: "2024-03-06"
+categories:
+ - blog
+ - snowflake
+ - io
+ - productivity
+---
+
+## Recap
+
+We've [blogged about Snowflake IO before](../snowflake-io/index.qmd), in the
+context of getting local files into Snowflake as fast as possible.
+
+In this post, we'll show how to use Ibis to insert query results from another
+system into Snowflake.
+
+## Setup
+
+### Connect to your non-Snowflake system
+
+We'll connect to a postgres database running locally in a container. You
+should be able to swap in your own connection details as needed.
+
+```{python}
+from ibis.interactive import * # <1>
+
+pg_con = ibis.connect("postgres://postgres:postgres@localhost/postgres")
+```
+
+1. Import Ibis for maximum productivity in interactive analysis.
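+
+If a URL is inconvenient, the backend-specific `connect` function accepts the
+same details as keyword arguments. Here's a sketch using the credentials of the
+local container above; swap in your own values as needed.
+
+```python
+# equivalent keyword-based connection (values match the local container above)
+pg_con = ibis.postgres.connect(
+    user="postgres",
+    password="postgres",
+    host="localhost",
+    database="postgres",
+)
+```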
+
+We'll use a test dataset that contains some baseball batting statistics.
+
+Ibis provides that example data, so we can fetch it and dump it into postgres.
+
+
+```{python}
+pg_batting = pg_con.create_table(
+ "batting",
+ ibis.examples.Batting.fetch().to_pandas(), # <1>
+ temp=True, # <2>
+)
+```
+
+1. Yep, I'm using pandas here!
+2. Use a temporary table to avoid cluttering up the database.
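+
+If you'd rather skip the pandas round trip, recent versions of Ibis also accept
+a pyarrow table in `create_table`; treat this as a sketch and check it against
+the version you're running.
+
+```python
+# hypothetical alternative: load the example data as pyarrow instead of pandas
+pg_batting = pg_con.create_table(
+    "batting",
+    ibis.examples.Batting.fetch().to_pyarrow(),
+    temp=True,
+)
+```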
+
+### Connect to Snowflake
+
+```{python}
+import os
+
+# snowflake://user:pass@account/database/schema?warehouse=my_warehouse
+snow_con = ibis.connect(os.environ["SNOWFLAKE_URL"]) # <1>
+```
+
+1. Set the `SNOWFLAKE_URL` environment variable to your Snowflake connection string.
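+
+If you don't already have a single URL handy, it's just the pieces from the
+comment above stitched together. Here's a sketch; the environment variable
+names and the database, schema, and warehouse values are placeholders for your
+own.
+
+```python
+# hypothetical: build the connection URL from separate pieces
+url = (
+    f"snowflake://{os.environ['SNOWFLAKE_USER']}:{os.environ['SNOWFLAKE_PASSWORD']}"
+    f"@{os.environ['SNOWFLAKE_ACCOUNT']}/MY_DATABASE/MY_SCHEMA"
+    "?warehouse=MY_WAREHOUSE"
+)
+snow_con = ibis.connect(url)
+```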
+
+## Profit
+
+### Construct an Ibis expression from the postgres data
+
+Let's build an Ibis expression based on the `batting` table in our postgres database.
+
+```{python}
+pg_batting
+```
+
+We can compute the average [RBI](https://en.wikipedia.org/wiki/Run_batted_in) per year per team.
+
+```{python}
+pg_expr = pg_batting.group_by(("year_id", "team_id")).agg(avg_rbi=_.rbi.mean())
+pg_expr
+```
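+
+If you're curious what this compiles to, `ibis.to_sql` will render the SQL Ibis
+generates for the postgres backend (shown here as a non-executed sketch):
+
+```python
+# print the SQL that postgres will actually run for this aggregation
+print(ibis.to_sql(pg_expr, dialect="postgres"))
+```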
+
+We can also rename columns to be more consistent with typical Snowflake usage.
+
+```{python}
+pg_expr = pg_expr.rename("ALL_CAPS")
+pg_expr
+```
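+
+The `"ALL_CAPS"` format string is shorthand; on the original lowercase columns
+you could spell out the same mapping explicitly, with the new name on the left
+and the existing name on the right (a sketch, not executed here):
+
+```python
+# equivalent explicit renaming, applied before the "ALL_CAPS" shorthand above
+pg_batting.group_by(("year_id", "team_id")).agg(avg_rbi=_.rbi.mean()).rename(
+    YEAR_ID="year_id", TEAM_ID="team_id", AVG_RBI="avg_rbi"
+)
+```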
+
+Let's show how many rows we have in the result.
+
+```{python}
+pg_expr.count()
+```
+
+### Insert the computed results into Snowflake
+
+Because all Ibis backends implement the `to_pyarrow()` method, we can
+get data out of another system and into Snowflake with a few lines of code.
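+
+The recipe looks the same no matter which backends are involved; roughly (the
+`source_expr` and `dest_con` names here are placeholders):
+
+```python
+# general pattern: create a destination table from the source schema,
+# then insert the materialized pyarrow result
+dest_con.create_table("destination_table", schema=source_expr.schema())
+dest_con.insert("destination_table", source_expr.to_pyarrow())
+```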
+
+First we'll create a table in Snowflake to hold the data.
+
+Ibis helps here by providing an API to access the schema of the
+**postgres**-based expression, and it automatically translates postgres types
+into Snowflake types.
+
+```{python}
+snow_table = snow_con.create_table("pg_batting", schema=pg_expr.schema(), temp=True) # <1>
+```
+
+1. By default the table will be created in the database and schema of the
+ current connection.
+
+ We create a temporary table for the same reason we do with postgres above.
+
+
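+If you want to see exactly which types get translated, the inferred schema is
+an ordinary object you can inspect:
+
+```python
+# the Ibis schema inferred from the postgres-backed expression; Snowflake
+# column types are derived from these when the table is created
+pg_expr.schema()
+```
+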
+We'll show that the new table is empty, as a sanity check.
+
+```{python}
+snow_table
+```
+
+Insert the expression's result table into Snowflake.
+
+```{python}
+snow_con.insert("pg_batting", pg_expr.to_pyarrow())
+```
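+
+If you re-run this and want to replace the table's contents rather than append
+to them, recent Ibis versions expose an `overwrite` flag on `insert`; check the
+version you're on before relying on it.
+
+```python
+# hypothetical re-run that replaces existing rows instead of appending
+snow_con.insert("pg_batting", pg_expr.to_pyarrow(), overwrite=True)
+```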
+
+To sanity check what we've done, let's peek at the table.
+
+```{python}
+snow_table
+```
+
+We'll count the rows too, to be extra sure.
+
+```{python}
+snow_table.count()
+```
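+
+If you'd rather have the computer do the comparison, `execute()` pulls each
+count back as a plain Python number, so a quick check looks something like
+this:
+
+```python
+# both counts should agree if the insert copied every row
+assert snow_table.count().execute() == pg_expr.count().execute()
+```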
+
+## Conclusion
+
+In this post we showed how easy it is to move data from another backend into Snowflake using Ibis.
+
+Please try it out and get in touch on [Zulip](https://ibis-project.zulipchat.com/) or
+[GitHub](https://github.com/ibis-project/ibis); we'd love to hear from you!