From 3a8c7ccb39a48a567be71c78c23afc8261bb8fb1 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Wed, 6 Mar 2024 08:18:35 -0600 Subject: [PATCH] docs(snowflake): add blog post showing insertion into snowflake from postgres (#8426) --- .../index/execute-results/html.json | 16 ++ docs/posts/into-snowflake/index.qmd | 143 ++++++++++++++++++ 2 files changed, 159 insertions(+) create mode 100644 docs/_freeze/posts/into-snowflake/index/execute-results/html.json create mode 100644 docs/posts/into-snowflake/index.qmd diff --git a/docs/_freeze/posts/into-snowflake/index/execute-results/html.json b/docs/_freeze/posts/into-snowflake/index/execute-results/html.json new file mode 100644 index 000000000000..26ee27a86963 --- /dev/null +++ b/docs/_freeze/posts/into-snowflake/index/execute-results/html.json @@ -0,0 +1,16 @@ +{ + "hash": "b777fdee8d50ae460617f4078c6145bf", + "result": { + "engine": "jupyter", + "markdown": "---\ntitle: \"Snow IO: loading data from other DBs into Snowflake\"\nauthor: \"Phillip Cloud\"\nerror: false\ndate: \"2024-03-06\"\ncategories:\n - blog\n - snowflake\n - io\n - productivity\n---\n\n## Recap\n\nWe've [blogged about Snowflake IO before](../snowflake-io/index.qmd), in the\ncontext of getting local files into Snowflake as fast as possible.\n\nIn this post, we'll show how to insert query results from another system into\nSnowflake, using Ibis.\n\n## Setup\n\n### Connect to your non-Snowflake system\n\nWe'll connect to a postgres database running locally in a container. You\nshould be able to swap in your own connection details as needed.\n\n::: {#52dc2246 .cell execution_count=1}\n``` {.python .cell-code}\nfrom ibis.interactive import * # <1>\n\npg_con = ibis.connect(\"postgres://postgres:postgres@localhost/postgres\")\n```\n:::\n\n\n1. 
Import Ibis for maximum productivity in interactive analysis.\n\nWe'll use a test dataset that contains some baseball batting statistics.\n\nIbis provides that example data, so we can dump that into postgres.\n\n::: {#c9ed5f4c .cell execution_count=2}\n``` {.python .cell-code}\npg_batting = pg_con.create_table(\n \"batting\",\n ibis.examples.Batting.fetch().to_pandas(), # <1>\n temp=True, # <2>\n)\n```\n:::\n\n\n1. Yep, I'm using pandas here!\n2. Use a temporary table to avoid cluttering up the database.\n\n### Connect to Snowflake\n\n::: {#5f332c9a .cell execution_count=3}\n``` {.python .cell-code}\nimport os\n\n# snowflake://user:pass@account/database/schema?warehouse=my_warehouse\nsnow_con = ibis.connect(os.environ[\"SNOWFLAKE_URL\"]) # <1>\n```\n:::\n\n\n1. Set the `SNOWFLAKE_URL` environment variable to your Snowflake connection string.\n\n## Profit\n\n### Construct an Ibis expression from the postgres data\n\nLet's build an Ibis expression based on the `batting` table in our postgres database.\n\n::: {#842f6246 .cell execution_count=4}\n``` {.python .cell-code}\npg_batting\n```\n\n::: {.cell-output .cell-output-display execution_count=5}\n```{=html}\n
┏━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓\n┃ player_id  year_id  stint  team_id  lg_id   g      ab     r      h      x2b    x3b    hr     rbi      sb       cs       bb     so       ibb      hbp      sh       sf       gidp    ┃\n┡━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩\n│ stringint64int64stringstringint64int64int64int64int64int64int64float64float64float64int64float64float64float64float64float64float64 │\n├───────────┼─────────┼───────┼─────────┼────────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼─────────┼─────────┼─────────┼───────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┤\n│ abercda0118711TRO    NA    14000000.00.00.000.0nannannannan0.0 │\n│ addybo01 18711RC1    NA    25118303260013.08.01.040.0nannannannan0.0 │\n│ allisar0118711CL1    NA    29137284045019.03.01.025.0nannannannan1.0 │\n│ allisdo0118711WS3    NA    271332844102227.01.01.002.0nannannannan0.0 │\n│ ansonca0118711RC1    NA    251202939113016.06.02.021.0nannannannan0.0 │\n│ armstbo0118711FW1    NA    12499112105.00.01.001.0nannannannan0.0 │\n│ barkeal0118711RC1    NA    14010002.00.00.010.0nannannannan0.0 │\n│ barnero0118711BS1    NA    311576663109034.011.06.0131.0nannannannan1.0 │\n│ barrebi0118711FW1    NA    15111001.00.00.000.0nannannannan0.0 │\n│ barrofr0118711BS1    NA    1886131321011.01.00.000.0nannannannan0.0 │\n│  │\n└───────────┴─────────┴───────┴─────────┴────────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴─────────┴─────────┴─────────┴───────┴─────────┴─────────┴─────────┴─────────┴─────────┴─────────┘\n
\n```\n:::\n:::\n\n\nWe can compute the average [RBI](https://en.wikipedia.org/wiki/Run_batted_in) per year per team.\n\n::: {#0fe95f00 .cell execution_count=5}\n``` {.python .cell-code}\npg_expr = pg_batting.group_by((\"year_id\", \"team_id\")).agg(avg_rbi=_.rbi.mean())\npg_expr\n```\n\n::: {.cell-output .cell-output-display execution_count=6}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━┓\n┃ year_id  team_id  avg_rbi   ┃\n┡━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━┩\n│ int64stringfloat64   │\n├─────────┼─────────┼───────────┤\n│    1891PIT    22.782609 │\n│    1895BSN    34.363636 │\n│    1940SLA    22.343750 │\n│    1981HOU    9.972973 │\n│    1913CLE    13.512821 │\n│    1971MON    17.181818 │\n│    2008PIT    15.000000 │\n│    1895WAS    23.096774 │\n│    2011KCA    16.785714 │\n│    2007MIL    19.350000 │\n│        │\n└─────────┴─────────┴───────────┘\n
\n```\n:::\n:::\n\n\nWe can also rename columns to be more consistent with typical Snowflake usage.\n\n::: {#c75c8ff3 .cell execution_count=6}\n``` {.python .cell-code}\npg_expr = pg_expr.rename(\"ALL_CAPS\")\npg_expr\n```\n\n::: {.cell-output .cell-output-display execution_count=7}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━┓\n┃ YEAR_ID  TEAM_ID  AVG_RBI   ┃\n┡━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━┩\n│ int64stringfloat64   │\n├─────────┼─────────┼───────────┤\n│    1891PIT    22.782609 │\n│    1895BSN    34.363636 │\n│    1940SLA    22.343750 │\n│    1981HOU    9.972973 │\n│    1913CLE    13.512821 │\n│    1971MON    17.181818 │\n│    2008PIT    15.000000 │\n│    1895WAS    23.096774 │\n│    2011KCA    16.785714 │\n│    2007MIL    19.350000 │\n│        │\n└─────────┴─────────┴───────────┘\n
\n```\n:::\n:::\n\n\nLet's show how many rows we have in the result.\n\n::: {#ac4befe6 .cell execution_count=7}\n``` {.python .cell-code}\npg_expr.count()\n```\n\n::: {.cell-output .cell-output-display}\n```{=html}\n
\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=8}\n\n::: {.ansi-escaped-output}\n```{=html}\n
3015
\n```\n:::\n\n:::\n:::\n\n\n### Insert the computed results into Snowflake\n\nBecause all Ibis backends implement the `to_pyarrow()` method, we can\nget data out of another system and into Snowflake with a few lines of code.\n\nFirst we'll create a table in Snowflake to hold the data.\n\nIbis helps here by providing an API to access the schema from the\n**postgres**-based expression, and automatically translates postgres types into\nSnowflake types.\n\n::: {#22568a53 .cell execution_count=8}\n``` {.python .cell-code}\nsnow_table = snow_con.create_table(\"pg_batting\", schema=pg_expr.schema(), temp=True) # <1>\n```\n:::\n\n\n1. By default the table will be created in the database and schema of the\n current connection.\n\n We create a temporary table for the same reason we do with postgres above.\n\n\nWe'll show that the table is empty to sanity check ourselves.\n\n::: {#dc8f71fa .cell execution_count=9}\n``` {.python .cell-code}\nsnow_table\n```\n\n::: {.cell-output .cell-output-display execution_count=10}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓\n┃ YEAR_ID  TEAM_ID  AVG_RBI ┃\n┡━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩\n│ int64stringfloat64 │\n└─────────┴─────────┴─────────┘\n
\n```\n:::\n:::\n\n\nInsert the expression's result table into Snowflake.\n\n::: {#01bcdc29 .cell execution_count=10}\n``` {.python .cell-code}\nsnow_con.insert(\"pg_batting\", pg_expr.to_pyarrow())\n```\n:::\n\n\nTo sanity check what we've done let's peek at the table.\n\n::: {#e7a29528 .cell execution_count=11}\n``` {.python .cell-code}\nsnow_table\n```\n\n::: {.cell-output .cell-output-display execution_count=12}\n```{=html}\n
┏━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━┓\n┃ YEAR_ID  TEAM_ID  AVG_RBI   ┃\n┡━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━┩\n│ int64stringfloat64   │\n├─────────┼─────────┼───────────┤\n│    1891PIT    22.782609 │\n│    1895BSN    34.363636 │\n│    1940SLA    22.343750 │\n│    1981HOU    9.972973 │\n│    1913CLE    13.512821 │\n│    1971MON    17.181818 │\n│    2008PIT    15.000000 │\n│    1895WAS    23.096774 │\n│    2011KCA    16.785714 │\n│    2007MIL    19.350000 │\n│        │\n└─────────┴─────────┴───────────┘\n
\n```\n:::\n:::\n\n\nWe'll count them too, to be extra sure.\n\n::: {#0b854a6c .cell execution_count=12}\n``` {.python .cell-code}\nsnow_table.count()\n```\n\n::: {.cell-output .cell-output-display}\n```{=html}\n
\n```\n:::\n\n::: {.cell-output .cell-output-display execution_count=13}\n\n::: {.ansi-escaped-output}\n```{=html}\n
3015
\n```\n:::\n\n:::\n:::\n\n\n## Conclusion\n\nIn this post we show how easy it is to move data from one backend into Snowflake using Ibis.\n\nPlease try it out and get in touch on [Zulip](https://ibis-project.zulipchat.com/) or\n[GitHub](https://github.com/ibis-project/ibis), we'd love to hear from you!\n\n", + "supporting": [ + "index_files" + ], + "filters": [], + "includes": { + "include-in-header": [ + "\n\n\n" + ] + } + } +} \ No newline at end of file diff --git a/docs/posts/into-snowflake/index.qmd b/docs/posts/into-snowflake/index.qmd new file mode 100644 index 000000000000..2fe6a6b85943 --- /dev/null +++ b/docs/posts/into-snowflake/index.qmd @@ -0,0 +1,143 @@ +--- +title: "Snow IO: loading data from other DBs into Snowflake" +author: "Phillip Cloud" +error: false +date: "2024-03-06" +categories: + - blog + - snowflake + - io + - productivity +--- + +## Recap + +We've [blogged about Snowflake IO before](../snowflake-io/index.qmd), in the +context of getting local files into Snowflake as fast as possible. + +In this post, we'll show how to insert query results from another system into +Snowflake, using Ibis. + +## Setup + +### Connect to your non-Snowflake system + +We'll connect to a postgres database running locally in a container. You +should be able to swap in your own connection details as needed. + +```{python} +from ibis.interactive import * # <1> + +pg_con = ibis.connect("postgres://postgres:postgres@localhost/postgres") +``` + +1. Import Ibis for maximum productivity in interactive analysis. + +We'll use a test dataset that contains some baseball batting statistics. + +Ibis provides that example data, so we can dump that into postgres. + + +```{python} +pg_batting = pg_con.create_table( + "batting", + ibis.examples.Batting.fetch().to_pandas(), # <1> + temp=True, # <2> +) +``` + +1. Yep, I'm using pandas here! +2. Use a temporary table to avoid cluttering up the database. 
+ +### Connect to Snowflake + +```{python} +import os + +# snowflake://user:pass@account/database/schema?warehouse=my_warehouse +snow_con = ibis.connect(os.environ["SNOWFLAKE_URL"]) # <1> +``` + +1. Set the `SNOWFLAKE_URL` environment variable to your Snowflake connection string. + +## Profit + +### Construct an Ibis expression from the postgres data + +Let's build an Ibis expression based on the `batting` table in our postgres database. + +```{python} +pg_batting +``` + +We can compute the average [RBI](https://en.wikipedia.org/wiki/Run_batted_in) per year per team. + +```{python} +pg_expr = pg_batting.group_by(("year_id", "team_id")).agg(avg_rbi=_.rbi.mean()) +pg_expr +``` + +We can also rename columns to be more consistent with typical Snowflake usage. + +```{python} +pg_expr = pg_expr.rename("ALL_CAPS") +pg_expr +``` + +Let's show how many rows we have in the result. + +```{python} +pg_expr.count() +``` + +### Insert the computed results into Snowflake + +Because all Ibis backends implement the `to_pyarrow()` method, we can +get data out of another system and into Snowflake with a few lines of code. + +First we'll create a table in Snowflake to hold the data. + +Ibis helps here by providing an API to access the schema from the +**postgres**-based expression, and automatically translates postgres types into +Snowflake types. + +```{python} +snow_table = snow_con.create_table("pg_batting", schema=pg_expr.schema(), temp=True) # <1> +``` + +1. By default the table will be created in the database and schema of the + current connection. + + We create a temporary table for the same reason we do with postgres above. + + +We'll show that the table is empty to sanity check ourselves. + +```{python} +snow_table +``` + +Insert the expression's result table into Snowflake. + +```{python} +snow_con.insert("pg_batting", pg_expr.to_pyarrow()) +``` + +To sanity check what we've done let's peek at the table. 
+ +```{python} +snow_table +``` + +We'll count them too, to be extra sure. + +```{python} +snow_table.count() +``` + +## Conclusion + +In this post we showed how easy it is to move data from one backend into Snowflake using Ibis. + +Please try it out and get in touch on [Zulip](https://ibis-project.zulipchat.com/) or +[GitHub](https://github.com/ibis-project/ibis). We'd love to hear from you!