diff --git a/examples/LLM_Workflows/RAG_document_extract_chunk_embed/README.md b/examples/LLM_Workflows/RAG_document_extract_chunk_embed/README.md index dd8912f8a..ffe7cec78 100644 --- a/examples/LLM_Workflows/RAG_document_extract_chunk_embed/README.md +++ b/examples/LLM_Workflows/RAG_document_extract_chunk_embed/README.md @@ -14,3 +14,8 @@ Open it in google collab: * simple_pipeline.ipynb - this contains documentation and code. Read this. * pipeline.py - what the code in simple_pipeline.ipynb creates for easy reference * requirements.txt - python dependencies required (outside of jupyter lab) + +To exercise this example you can run it in Google Colab: + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg) +](https://colab.research.google.com/github/dagworks-inc/hamilton/blob/main/examples/LLM_Workflows/RAG_document_extract_chunk_embed/simple_pipeline.ipynb) diff --git a/examples/caching_nodes/caching_graph_adapter/README.md b/examples/caching_nodes/caching_graph_adapter/README.md index e2db91d4d..bda41bf6a 100644 --- a/examples/caching_nodes/caching_graph_adapter/README.md +++ b/examples/caching_nodes/caching_graph_adapter/README.md @@ -16,3 +16,8 @@ For iterating during development, the general process would be: its name to the adapter in the `force_compute` argument. Then, this node and its downstream nodes will be computed instead of loaded from cache. 4. When no longer required, you can just skip (3) and any caching behavior will be skipped. + +To exercise this example you can run it in Google Colab: + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg) +](https://colab.research.google.com/github/dagworks-inc/hamilton/blob/main/examples/caching_nodes/caching_graph_adapter/caching_nodes.ipynb) diff --git a/examples/caching_nodes/caching_graph_adapter/caching_nodes.ipynb b/examples/caching_nodes/caching_graph_adapter/caching_nodes.ipynb index ea5117b40..b77434ae7 100644 --- a/examples/caching_nodes/caching_graph_adapter/caching_nodes.ipynb +++ b/examples/caching_nodes/caching_graph_adapter/caching_nodes.ipynb @@ -7,6 +7,13 @@ "# Caching Nodes with Hamilton" ] }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "#! pip install pandas pyarrow sf-hamilton" + }, { "cell_type": "code", "execution_count": 1, diff --git a/examples/contrib/README.md b/examples/contrib/README.md index 0727471fd..d052c169f 100644 --- a/examples/contrib/README.md +++ b/examples/contrib/README.md @@ -9,6 +9,10 @@ For the purpose of this example, we will create a virtual environment with hamil 2. `. venv/bin/activate` (on MacOS / Linux) or `. venv/bin/Scripts` (Windows) 3. `pip install -r requirements.txt` +Or run it in Google Colab: +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg) +](https://colab.research.google.com/github/dagworks-inc/hamilton/blob/main/examples/contrib/notebook.ipynb) + # 3 ways to import There are 3 main ways to use community dataflows: static installation, dynamic installation, and local copy (see [documentation](https://github.com/DAGWorks-Inc/hamilton/tree/main/contrib)). 
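As a quick, hedged illustration of the dynamic-installation route mentioned above (the dataflow and user names below are placeholders for illustration, not part of this diff):

```python
# Sketch of dynamic installation: pull a community dataflow at runtime.
# "text_summarization" / "elijahbenizzy" are illustrative names; any contrib dataflow works.
from hamilton import dataflows, driver

mod = dataflows.import_module("text_summarization", "elijahbenizzy")  # downloads & imports the dataflow
dr = driver.Driver({}, mod)  # build a driver from the imported module as usual
print(dr.list_available_variables())
```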
We present each of them in this example: diff --git a/examples/contrib/notebook.ipynb b/examples/contrib/notebook.ipynb index d201d4d65..6145d862e 100644 --- a/examples/contrib/notebook.ipynb +++ b/examples/contrib/notebook.ipynb @@ -20,6 +20,13 @@ "collapsed": false } }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "# !pip install sf-hamilton-contrib" + }, { "cell_type": "markdown", "metadata": {}, diff --git a/examples/dagster/hamilton_code/notebook.ipynb b/examples/dagster/hamilton_code/notebook.ipynb index b4687aaee..0ee7dccfd 100644 --- a/examples/dagster/hamilton_code/notebook.ipynb +++ b/examples/dagster/hamilton_code/notebook.ipynb @@ -9,6 +9,13 @@ "[Tips on Hamilton + notebooks in the docs](https://hamilton.dagworks.io/en/latest/how-tos/use-in-jupyter-notebook/)" ] }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "" + }, { "cell_type": "markdown", "metadata": {}, diff --git a/examples/dask/community_demo/README.md b/examples/dask/community_demo/README.md index b28fbe2be..fb3132fb6 100644 --- a/examples/dask/community_demo/README.md +++ b/examples/dask/community_demo/README.md @@ -19,6 +19,12 @@ pip install -r requirements.txt jupyter notebook ``` +Or run it in Google Colab: +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg) +](https://colab.research.google.com/github/dagworks-inc/hamilton/blob/main/examples/dask/community_demo/demo_day_notebook.ipynb) + + + If you have questions, or need help with this example, join us on [slack](https://join.slack.com/t/hamilton-opensource/shared_invite/zt-1bjs72asx-wcUTgH7q7QX1igiQ5bbdcg), and we'll try to help! diff --git a/examples/dask/community_demo/demo_day_notebook.ipynb b/examples/dask/community_demo/demo_day_notebook.ipynb index 79d361b3a..823af1f27 100644 --- a/examples/dask/community_demo/demo_day_notebook.ipynb +++ b/examples/dask/community_demo/demo_day_notebook.ipynb @@ -1,18 +1,17 @@ { "cells": [ { + "metadata": {}, "cell_type": "code", - "execution_count": 1, - "metadata": { - "ExecuteTime": { - "end_time": "2023-05-22T22:27:16.952106Z", - "start_time": "2023-05-22T22:27:15.116241Z" - }, - "pycharm": { - "name": "#%%\n" - } - }, "outputs": [], + "execution_count": null, + "source": "#!pip install pandas \"sf-hamilton[dask,visualization]\"" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": 1, "source": [ "# Cell 1 - import the things you need\n", "import logging\n", diff --git a/examples/dask/hello_world/README.md b/examples/dask/hello_world/README.md index f936ce33a..b39c66d41 100644 --- a/examples/dask/hello_world/README.md +++ b/examples/dask/hello_world/README.md @@ -19,6 +19,12 @@ idea is that you'd swap this module out for other ways of loading data or use @c * `run_with_delayed_and_dask_objects.py` shows the combination of the above. It is slightly non-sensical, since we're entirely operating on what are dask objects effectively. But otherwise shows the code pattern to use both. 
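For orientation, a minimal sketch of the delayed-based pattern these run scripts follow — module and column names are illustrative, and the adapter import path assumes the `hamilton.plugins.h_dask` plugin shipped with `sf-hamilton[dask]`:

```python
# Hedged sketch: run a Hamilton dataflow on Dask via delayed execution.
# `my_functions` stands in for the example's transform module.
import my_functions
from dask import distributed
from hamilton import base, driver
from hamilton.plugins import h_dask

cluster = distributed.LocalCluster()          # local Dask cluster for development
client = distributed.Client(cluster)
adapter = h_dask.DaskGraphAdapter(client, base.PandasDataFrameResult())
dr = driver.Driver({}, my_functions, adapter=adapter)
df = dr.execute(["spend", "signups", "spend_per_signup"])  # computed on Dask, collected to pandas
client.shutdown()
```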
+Or run it in Google Colab:
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)
+](https://colab.research.google.com/github/dagworks-inc/hamilton/blob/main/examples/dask/hello_world/notebook.ipynb)
+
+
+
 # Visualization of execution
 Here is the graph of execution:
diff --git a/examples/dlt/notebook.ipynb b/examples/dlt/notebook.ipynb
index d4be18ea0..fe4d0040f 100644
--- a/examples/dlt/notebook.ipynb
+++ b/examples/dlt/notebook.ipynb
@@ -1,8 +1,15 @@
 {
  "cells": [
   {
-   "cell_type": "markdown",
    "metadata": {},
+   "cell_type": "code",
+   "source": "#! pip install dlt[duckdb]>=0.3.12 ibis-framework[duckdb] openai pandas polars \"sf-hamilton[visualization]\"",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
    "source": [
     "# Slack Summaries\n",
     "This notebook shows how to ingest Slack messages and generate threads summaries.\n",
@@ -31,18 +38,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "slack_pipeline.dataset_name=slack_data_20240404085714\n",
-      "slack_pipeline.pipeline_name=slack\n"
-     ]
-    }
-   ],
    "source": [
     "import dlt\n",
     "import slack\n",
     "\n",
     "print(f\"\"\"{slack_pipeline.dataset_name=:}\n",
     "{slack_pipeline.pipeline_name=:}\"\"\")"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
@@ -73,24 +71,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Pipeline slack load step completed in 1.05 seconds\n",
-      "1 load package(s) were loaded to destination duckdb and into dataset slack_data_20240404085714\n",
-      "The duckdb destination used duckdb:////home/tjean/projects/dagworks/hamilton/examples/dlt/slack.duckdb location to store data\n",
-      "Load package 1712264235.2934568 is LOADED and contains no failed jobs\n"
-     ]
-    }
-   ],
    "source": [
     "load_info = slack_pipeline.run(dlt_source)\n",
     "print(load_info)"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
@@ -114,293 +101,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
    "source": [
     "%load_ext hamilton.plugins.jupyter_magic\n",
     "from IPython.display import display"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
    "metadata": {},
-   "outputs": [
-    [removed output elided: SVG render of the dataflow DAG — nodes: threads, threads.with_summary, db_con, channel, channel_message, channel_replies, channel_threads, channels_collection, summary_prompt, threads.with_format_messages, threads.with_aggregate_thread — plus input/function/expand/collect legend]
-   ],
    "source": [
     "%%cell_to_module -m jupyter_transform -d\n",
     "\n",
@@ -585,7 +296,9 @@
     "    threads_table = db_con.create_table(\"threads\", threads)\n",
     "    db_con.insert(\"threads\", threads)\n",
     "    return int(threads_table.count().execute())"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
@@ -606,9 +319,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
    "metadata": {},
-   "outputs": [],
    "source": [
     "from hamilton import driver\n",
     "import jupyter_transform\n",
     "\n",
     "dr = (\n",
     "    driver.Builder()\n",
     "    .with_modules(jupyter_transform)\n",
     "    .build()\n",
     ")"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
@@ -633,353 +346,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
    "metadata": {},
-   "outputs": [
-    [removed output elided: SVG render of the execution graph (same nodes as above, with output legend) and an HTML table duplicating the text/plain dataframe below]
-    {
-     "data": {
-      "text/plain": [
-       "           thread_ts                                             thread  \\\n",
-       "0  1711048769.908319                  1: from the general channel again   \n",
-       "1  1711053147.089719         1: along with a reply\\n 1: another message   \n",
-       "2  1711048757.443209                                      1: hello world   \n",
-       "3  1711048765.747779                                  1: general channel   \n",
-       "4  1711048761.709929                                      1: 2nd message   \n",
-       "5  1711048764.747779  1: my 2nd reply\\n 1: will this be picked up by...   \n",
-       "\n",
-       "   num_messages          users        _dlt_load_id  \\\n",
-       "0             1  [U06R4CY0Q65]  1712264235.2934568   \n",
-       "1             2  [U06R4CY0Q65]  1712264235.2934568   \n",
-       "2             1  [U06R4CY0Q65]  1712264235.2934568   \n",
-       "3             1  [U06R4CY0Q65]  1712264235.2934568   \n",
-       "4             1  [U06R4CY0Q65]  1712264235.2934568   \n",
-       "5             3  [U06R4CY0Q65]  1712264235.2934568   \n",
-       "\n",
-       "                                            _dlt_id  \n",
-       "0                                  [YzwD7S5kc4OYeA]  \n",
-       "1                  [LItHtsbAAX6/Kg, XFxI2RmALQkXfg]  \n",
-       "2                                  [x/vVKj7+sMyNvg]  \n",
-       "3                                  [aT+AcnCycBlp8A]  \n",
-       "4                                  [ENPKHYpQW15AFg]  \n",
-       "5  [kEt1kEo9e0mkqQ, LatviGHcnPd8yw, lyx94g8HxvVcIA]  "
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
    "source": [
     "inputs = dict(\n",
     "    pipeline=slack_pipeline,\n",
@@ -992,7 +359,9 @@
     "df = results[\"threads.with_aggregate_thread\"].to_pandas()\n",
     "\n",
     "display(dr.visualize_execution(final_vars=final_vars, inputs=inputs), df)"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
@@ -1003,426 +372,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
    "metadata": {},
-   "outputs": [],
    "source": [
     "import os \n",
     "import getpass\n",
     "\n",
     "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter your OpenAI key\")"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
    "metadata": {},
-   "outputs": [
-    [removed output elided: SVG render of the dataflow DAG with summary_prompt/threads.with_summary, and an HTML table duplicating the text/plain dataframe below]
-    {
-     "data": {
-      "text/plain": [
-       "           thread_ts                                             thread  \\\n",
-       "0  1711048769.908319                  1: from the general channel again   \n",
-       "1  1711053147.089719         1: another message\\n 1: along with a reply   \n",
-       "2  1711048757.443209                                      1: hello world   \n",
-       "3  1711048761.709929                                      1: 2nd message   \n",
-       "4  1711048765.747779                                  1: general channel   \n",
-       "5  1711048764.747779  1: my 2nd reply\\n 1: my 1st reply\\n 1: will th...   \n",
-       "\n",
-       "   num_messages          users        _dlt_load_id  \\\n",
-       "0             1  [U06R4CY0Q65]  1712264235.2934568   \n",
-       "1             2  [U06R4CY0Q65]  1712264235.2934568   \n",
-       "2             1  [U06R4CY0Q65]  1712264235.2934568   \n",
-       "3             1  [U06R4CY0Q65]  1712264235.2934568   \n",
-       "4             1  [U06R4CY0Q65]  1712264235.2934568   \n",
-       "5             3  [U06R4CY0Q65]  1712264235.2934568   \n",
-       "\n",
-       "                                            _dlt_id  \\\n",
-       "0                                  [YzwD7S5kc4OYeA]  \n",
-       "1                  [XFxI2RmALQkXfg, LItHtsbAAX6/Kg]  \n",
-       "2                                  [x/vVKj7+sMyNvg]  \n",
-       "3                                  [ENPKHYpQW15AFg]  \n",
-       "4                                  [aT+AcnCycBlp8A]  \n",
-       "5  [kEt1kEo9e0mkqQ, lyx94g8HxvVcIA, LatviGHcnPd8yw]  \n",
-       "\n",
-       "                                             summary  \n",
-       "0  User1: Has anyone encountered issues with Hami...  \n",
-       "1  User1:\\n        Hey everyone, I'm having troub...  \n",
-       "2  2: Hi User1, what's up?\\n        \\n        3: ...  \n",
-       "3  User1: Hi everyone, I've been using Hamilton f...  \n",
-       "4  User1: \"I've been trying to use Hamilton for m...  \n",
-       "5  User1's issue: User1 is questioning whether ce...  "
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
    "source": [
     "final_vars = [\"threads\"]  # replace by `\"insert_threads\"` to directly store results\n",
     "results = dr.execute(final_vars, inputs=inputs)\n",
     "df2 = results[\"threads\"].to_pandas()\n",
     "\n",
     "display(dr.visualize_execution(final_vars=final_vars, inputs=inputs), df2)"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
   }
  ],
  "metadata": {
diff --git a/examples/hello_world/README.md b/examples/hello_world/README.md
index 9a394beea..158186438 100644
--- a/examples/hello_world/README.md
+++ b/examples/hello_world/README.md
@@ -18,6 +18,12 @@ To run things:
 > python my_script.py
 ```
+To exercise this example you can run it in Google Colab:
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)
+](https://colab.research.google.com/github/dagworks-inc/hamilton/blob/main/examples/hello_world/my_notebook.ipynb)
+
+
 If you have questions, or need help with this example,
 join us on [slack](https://join.slack.com/t/hamilton-opensource/shared_invite/zt-1bjs72asx-wcUTgH7q7QX1igiQ5bbdcg), and we'll try to help!
diff --git a/examples/hello_world/my_notebook.ipynb b/examples/hello_world/my_notebook.ipynb
index 86337d2bd..910020f1f 100644
--- a/examples/hello_world/my_notebook.ipynb
+++ b/examples/hello_world/my_notebook.ipynb
@@ -2,88 +2,278 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "source": [
-    "Uncomment and run the cell below if you are in a Google Colab environment. It will:\n",
-    "1. Mount google drive. You will be asked to authenticate and give permissions.\n",
-    "2. Change directory to google drive.\n",
-    "3. Make a directory \"hamilton-tutorials\"\n",
-    "4. Change directory to it.\n",
-    "5. Clone this repository to your google drive\n",
-    "6. Move your current directory to the hello_world example\n",
-    "7. Install requirements.\n",
-    "\n",
-    "This means that any modifications will be saved, and you won't lose them if you close your browser."
-   ],
    "metadata": {
-    "collapsed": false
-   }
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
+   },
+   "source": [
+    "# Hello World Example\n",
+    "This uses the jupyter magic commands to create a simple example of how to use Hamilton."
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
+  "metadata": {
+   "collapsed": false,
+   "jupyter": {
+    "outputs_hidden": false
+   }
+  },
   "outputs": [],
   "source": [
-   "## 1. Mount google drive\n",
-   "# from google.colab import drive\n",
-   "# drive.mount('/content/drive')\n",
-   "## 2. Change directory to google drive.\n",
-   "# %cd /content/drive/MyDrive\n",
-   "## 3. Make a directory \"hamilton-tutorials\"\n",
-   "# !mkdir hamilton-tutorials\n",
-   "## 4. Change directory to it.\n",
-   "# %cd hamilton-tutorials\n",
-   "## 5. Clone this repository to your google drive\n",
-   "# !git clone https://github.com/DAGWorks-Inc/hamilton/\n",
-   "## 6. Move your current directory to the hello_world example\n",
-   "# %cd hamilton/examples/hello_world\n",
-   "## 7. Install requirements.\n",
-   "# %pip install -r requirements.txt\n",
-   "# clear_output()  # optionally clear outputs\n",
-   "# To check your current working directory you can type `!pwd` in a cell and run it."
-  ],
-  "metadata": {
-   "collapsed": false
-  }
+   "#! pip install pandas \"sf-hamilton[visualization]\""
+  ]
  },
  {
   "cell_type": "code",
-  "execution_count": 1,
+  "execution_count": 10,
+  "metadata": {
+   "ExecuteTime": {
+    "end_time": "2024-06-05T05:39:54.379474Z",
+    "start_time": "2024-06-05T05:39:47.097691Z"
+   },
+   "collapsed": false,
+   "jupyter": {
+    "outputs_hidden": false
+   }
+  },
   "outputs": [],
   "source": [
    "# Cell 1 - import the things you need\n",
    "import logging\n",
    "import sys\n",
+   "from hamilton import driver\n",
    "\n",
-   "import numpy as np\n",
-   "import pandas as pd\n",
-   "\n",
-   "from hamilton import ad_hoc_utils, driver\n",
-   "\n",
+   "%load_ext hamilton.plugins.jupyter_magic\n",
    "logging.basicConfig(stream=sys.stdout)"
-  ],
-  "metadata": {
-   "collapsed": false,
-   "ExecuteTime": {
-    "end_time": "2023-11-08T00:06:17.334441Z",
-    "start_time": "2023-11-08T00:06:13.711650Z"
-   }
-  }
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 2,
+  "metadata": {
+   "ExecuteTime": {
+    "end_time": "2024-06-05T05:41:01.783934Z",
+    "start_time": "2024-06-05T05:41:01.152301Z"
+   }
+  },
+  "outputs": [
+   [added output elided: SVG render of the my_functions DAG — nodes: spend_std_dev, spend_zero_mean_unit_variance, spend_zero_mean, avg_3wk_spend, spend_mean, spend_per_signup and their spend/signups inputs, with input/function legend]
+  ],
+  "source": [
+   "%%cell_to_module my_functions --display\n",
+   "# Cell 2 - Define your functions in a module.\n",
+   "import pandas as pd\n",
+   "\n",
+   "def avg_3wk_spend(spend: pd.Series) -> pd.Series:\n",
+   "    \"\"\"Rolling 3 week average spend.\"\"\"\n",
+   "    return spend.rolling(3).mean()\n",
+   "\n",
+   "def spend_per_signup(spend: pd.Series, signups: pd.Series) -> pd.Series:\n",
+   "    \"\"\"The cost per signup in relation to spend.\"\"\"\n",
+   "    return spend / signups\n",
+   "\n",
+   "def spend_mean(spend: pd.Series) -> float:\n",
+   "    \"\"\"Shows function creating a scalar. In this case it computes the mean of the entire column.\"\"\"\n",
+   "    return spend.mean()\n",
+   "\n",
+   "def spend_zero_mean(spend: pd.Series, spend_mean: float) -> pd.Series:\n",
+   "    \"\"\"Shows function that takes a scalar. In this case to zero mean spend.\"\"\"\n",
+   "    return spend - spend_mean\n",
+   "\n",
+   "def spend_std_dev(spend: pd.Series) -> float:\n",
+   "    \"\"\"Function that computes the standard deviation of the spend column.\"\"\"\n",
+   "    return spend.std()\n",
+   "\n",
+   "def spend_zero_mean_unit_variance(spend_zero_mean: pd.Series, spend_std_dev: float) -> pd.Series:\n",
+   "    \"\"\"Function showing one way to make spend have zero mean and unit variance.\"\"\"\n",
+   "    return spend_zero_mean / spend_std_dev\n"
+  ]
  },
 {
  "cell_type": "code",
-  "execution_count": 2,
+  "execution_count": 3,
  "metadata": {
-   "pycharm": {
-    "name": "#%%\n"
-   },
   "ExecuteTime": {
    "end_time": "2023-11-08T00:06:17.404504Z",
    "start_time": "2023-11-08T00:06:17.343242Z"
+   },
+   "pycharm": {
+    "name": "#%%\n"
   }
  },
  "outputs": [],
  "source": [
-   "# Cell 2 - import modules to create part of the DAG from\n",
+   "# Cell 3 - Optional - if you have existing modules you can import them here.\n",
   "# We use the autoreload extension that comes with ipython to automatically reload modules when\n",
   "# the code in them changes.\n",
@@ -94,78 +284,134 @@
   "# import the function modules you want to reload when they change.\n",
   "# i.e. these should be your modules you write your functions in. As you change them,\n",
   "# they will be reimported without you having to do anything.\n",
-   "%aimport my_functions"
+   "# %aimport NAME_OF_MODULE # uncomment and replace NAME_OF_MODULE with the name of your module"
  ]
 },
 {
  "cell_type": "code",
-  "execution_count": 3,
+  "execution_count": 4,
  "metadata": {
+   "ExecuteTime": {
+    "end_time": "2024-06-05T05:43:08.346466Z",
+    "start_time": "2024-06-05T05:43:07.937399Z"
+   },
   "pycharm": {
    "name": "#%%\n"
-   },
-   "ExecuteTime": {
-    "end_time": "2023-11-08T00:06:17.419982Z",
-    "start_time": "2023-11-08T00:06:17.411216Z"
   }
  },
-  "outputs": [],
+  "outputs": [
+   [added output elided: SVG render of the extra_functions DAG — nodes: spend, signups, log_spend_per_signup and the spend_per_signup input, with input/function legend]
+  ],
  "source": [
-   "# Cell 3 - Define your new Hamilton functions & curate them into a TemporaryFunctionModule object.\n",
-   "# This enables you to add functions to your DAG without creating a proper module.\n",
-   "# This is ONLY INTENDED FOR QUICK DEVELOPMENT. For moving to production move these to an actual module.\n",
-   "\n",
+   "%%cell_to_module extra_functions --display\n",
+   "# Cell 4 - Define your new Hamilton functions \n",
+   "import numpy as np\n",
+   "import pandas as pd\n",
   "# Look at `my_functions` to see how these functions connect.\n",
   "def signups() -> pd.Series:\n",
   "    \"\"\"Returns sign up values\"\"\"\n",
   "    return pd.Series([1, 10, 50, 100, 200, 400])\n",
   "\n",
-   "\n",
   "def spend() -> pd.Series:\n",
   "    \"\"\"Returns the spend values\"\"\"\n",
   "    return pd.Series([10, 10, 20, 40, 40, 50])\n",
   "\n",
-   "\n",
   "def log_spend_per_signup(spend_per_signup: pd.Series) -> pd.Series:\n",
   "    \"\"\"Simple function taking the logarithm of spend over signups.\"\"\"\n",
-   "    return np.log(spend_per_signup)\n",
-   "\n",
-   "\n",
-   "# Place the functions into a temporary module -- the idea is that this should house a curated set of functions.\n",
-   "# Don't be afraid to make multiple of them -- however we'd advise you to not use this method for production.\n",
-   "# Also note, that using a temporary function module does not work for scaling onto Ray, Dask, or Pandas on Spark.\n",
-   "temp_module = ad_hoc_utils.create_temporary_module(\n",
-   "    spend, signups, log_spend_per_signup, module_name=\"function_example\"\n",
-   ")"
+   "    return np.log(spend_per_signup)"
  ]
 },
 {
  "cell_type": "code",
-  "execution_count": 4,
+  "execution_count": 5,
  "metadata": {
+   "ExecuteTime": {
+    "end_time": "2024-06-05T05:43:42.229169Z",
+    "start_time": "2024-06-05T05:43:42.210553Z"
+   },
   "pycharm": {
    "name": "#%%\n"
-   },
-   "ExecuteTime": {
-    "end_time": "2023-11-08T00:06:17.461900Z",
-    "start_time": "2023-11-08T00:06:17.422033Z"
   }
  },
-  "outputs": [
-   {
-    "name": "stdout",
-    "output_type": "stream",
-    "text": [
-     "WARNING:hamilton.telemetry:Note: Hamilton collects completely anonymous data about usage. This will help us improve Hamilton over time. See https://github.com/dagworks-inc/hamilton#usage-analytics--data-privacy for details.\n"
-    ]
-   }
-  ],
+  "outputs": [],
  "source": [
-   "# Cell 4 - Instantiate the Hamilton driver and pass it the right things in.\n",
-   "\n",
+   "# Cell 5 - Instantiate the Hamilton driver and pass it the right things in.\n",
   "initial_config = {}\n",
-   "# we need to tell hamilton where to load function definitions from\n",
-   "dr = driver.Driver(initial_config, my_functions, temp_module) # can pass in multiple modules\n",
+   "# my_functions and extra_functions are the modules we created in cells 2 and 4.\n",
+   "dr = driver.Driver(initial_config, my_functions, extra_functions) # can pass in multiple modules\n",
   "# we need to specify what we want in the final dataframe.\n",
   "output_columns = [\n",
   "    \"spend\",\n",
@@ -178,20 +424,169 @@
 },
 {
  "cell_type": "code",
-  "execution_count": 5,
+  "execution_count": 6,
  "metadata": {
   "ExecuteTime": {
-    "end_time": "2023-11-08T00:06:18.080346Z",
-    "start_time": "2023-11-08T00:06:17.452028Z"
+    "end_time": "2024-06-05T05:43:52.023683Z",
+    "start_time": "2024-06-05T05:43:51.626814Z"
   }
  },
  "outputs": [
   {
    "data": {
-     "image/svg+xml": "[single-line SVG elided: render of the full DAG including log_spend_per_signup]",
-     "text/plain": ""
+     "image/svg+xml": [
+      [added output elided: multi-line SVG render of the full DAG — signups, spend, spend_per_signup, log_spend_per_signup, spend_mean, spend_std_dev, spend_zero_mean, spend_zero_mean_unit_variance, avg_3wk_spend — with function legend]
+     ],
+     "text/plain": [
+      ""
+     ]
    },
-    "execution_count": 5,
+    "execution_count": 6,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
 },
 {
  "cell_type": "code",
-  "execution_count": 6,
+  "execution_count": 7,
  "metadata": {
+   "ExecuteTime": {
+    "end_time": "2024-06-05T05:43:52.549992Z",
+    "start_time": "2024-06-05T05:43:52.259829Z"
+   },
   "pycharm": {
    "name": "#%%\n"
-   },
-   "ExecuteTime": {
-    "end_time": "2023-11-08T00:06:18.639068Z",
-    "start_time": "2023-11-08T00:06:18.070616Z"
   }
  },
  "outputs": [
   {
    "data": {
-     "image/svg+xml": "[single-line SVG elided: render of the execution graph with output legend]",
-     "text/plain": ""
+     "image/svg+xml": [
+      [added output elided: multi-line SVG render of the execution graph for the requested output columns, with function/output legend]
+     ],
+     "text/plain": [
+      ""
+     ]
    },
-    "execution_count": 6,
+    "execution_count": 7,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
 },
 {
  "cell_type": "code",
-  "execution_count": 7,
+  "execution_count": 8,
+  "metadata": {
+   "ExecuteTime": {
+    "end_time": "2024-06-05T05:43:54.899373Z",
+    "start_time": "2024-06-05T05:43:54.609411Z"
+   },
+   "collapsed": false,
+   "jupyter": {
+    "outputs_hidden": false
+   }
+  },
  "outputs": [
   {
    "data": {
-     "image/svg+xml": "[single-line SVG elided: render of the path between spend_mean and spend_zero_mean_unit_variance]",
-     "text/plain": ""
+     "image/svg+xml": [
+      [added output elided: multi-line SVG render of the path between spend_mean and spend_zero_mean_unit_variance — spend, spend_mean, spend_std_dev, spend_zero_mean, spend_zero_mean_unit_variance — with function legend]
+     ],
+     "text/plain": [
+      ""
+     ]
    },
-    "execution_count": 7,
+    "execution_count": 8,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "# visualize the path of execution between two functions\n",
   "dr.visualize_path_between(\"spend_mean\", \"spend_zero_mean_unit_variance\",\n",
   "                          strict_path_visualization=False)"
-  ],
-  "metadata": {
-   "collapsed": false,
-   "ExecuteTime": {
-    "end_time": "2023-11-08T00:06:19.243065Z",
-    "start_time": "2023-11-08T00:06:18.632623Z"
-   }
-  }
+  ]
 },
 {
  "cell_type": "code",
-  "execution_count": 8,
+  "execution_count": 9,
  "metadata": {
   "ExecuteTime": {
-    "end_time": "2023-11-08T00:06:19.580251Z",
-    "start_time": "2023-11-08T00:06:19.520006Z"
+    "end_time": "2024-06-05T05:43:57.619299Z",
+    "start_time": "2024-06-05T05:43:57.580545Z"
   }
  },
  "outputs": [
   {
    "data": {
-     "text/plain": "[single-line dataframe dump elided: duplicates the text/plain array below]",
-     "text/html": "[single-line HTML table elided: duplicates the text/plain array below]"
+     "text/html": [
+      [added output elided: multi-line HTML table duplicating the text/plain dataframe below]
+     ],
+     "text/plain": [
+      "   spend  signups  avg_3wk_spend  spend_per_signup  \\\n",
+      "0     10        1            NaN            10.000   \n",
+      "1     10       10            NaN             1.000   \n",
+      "2     20       50      13.333333             0.400   \n",
+      "3     40      100      23.333333             0.400   \n",
+      "4     40      200      33.333333             0.200   \n",
+      "5     50      400      43.333333             0.125   \n",
+      "\n",
+      "   spend_zero_mean_unit_variance  \n",
+      "0                      -1.064405  \n",
+      "1                      -1.064405  \n",
+      "2                      -0.483821  \n",
+      "3                       0.677349  \n",
+      "4                       0.677349  \n",
+      "5                       1.257934  "
+     ]
    },
-    "execution_count": 8,
+    "execution_count": 9,
    "metadata": {},
    "output_type": "execute_result"
   }
 ],
@@ -313,9 +1044,9 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.13"
+   "version": "3.9.13"
  }
 },
 "nbformat": 4,
- "nbformat_minor": 1
+ "nbformat_minor": 4
}
diff --git a/examples/jupyter_notebook_magic/README.md b/examples/jupyter_notebook_magic/README.md
index 9c0a4a73d..dba35afe7 100644
--- a/examples/jupyter_notebook_magic/README.md
+++ b/examples/jupyter_notebook_magic/README.md
@@ -24,4 +24,9 @@ def reply(joke_prompt: str) -> str:
     return f"{right} who?"
 ```
-Go explore `tutorial.ipynb` to learn about all interactive features!
+Go explore `example.ipynb` to learn about all interactive features!
+
+To exercise this example you can run it in Google Colab too:
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)
+](https://colab.research.google.com/github/dagworks-inc/hamilton/blob/main/examples/jupyter_notebook_magic/example.ipynb)
diff --git a/examples/model_examples/scikit-learn/Hamilton for ML dataflows.ipynb b/examples/model_examples/scikit-learn/Hamilton_for_ML_dataflows.ipynb
similarity index 100%
rename from examples/model_examples/scikit-learn/Hamilton for ML dataflows.ipynb
rename to examples/model_examples/scikit-learn/Hamilton_for_ML_dataflows.ipynb
diff --git a/examples/model_examples/scikit-learn/README.md b/examples/model_examples/scikit-learn/README.md
index 2ca166409..a877a18d8 100644
--- a/examples/model_examples/scikit-learn/README.md
+++ b/examples/model_examples/scikit-learn/README.md
@@ -24,6 +24,11 @@ house the same function names as they should map to the inputs required by funct
 * run.py houses the "driver code" required to stitch everything together. It is responsible for creating the right configuration
 to create the DAG, as well as determining what python modules should be loaded.
+You can even run this example in Google Colab:
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)
+](https://colab.research.google.com/github/dagworks-inc/hamilton/blob/main/examples/model_examples/scikit-learn/Hamilton_for_ML_dataflows.ipynb)
+
+
 # Visualization of execution
 Here is the graph of execution for the digits data set and logistic regression model:
diff --git a/examples/model_examples/time-series/Hamilton - Time Series model.ipynb b/examples/model_examples/time-series/Hamilton-TimeSeriesmodel.ipynb
similarity index 100%
rename from examples/model_examples/time-series/Hamilton - Time Series model.ipynb
rename to examples/model_examples/time-series/Hamilton-TimeSeriesmodel.ipynb
diff --git a/examples/model_examples/time-series/README.md b/examples/model_examples/time-series/README.md
index 50a2d4825..e7de11d31 100644
--- a/examples/model_examples/time-series/README.md
+++ b/examples/model_examples/time-series/README.md
@@ -16,6 +16,12 @@ you will need to log in to Kaggle to download the data.
 3. Decompress the data into the same folder as the code.
 4. Run `run.py`. `python run.py`.
+You can even run this example in Google Colab: +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg) +](https://colab.research.google.com/github/dagworks-inc/hamilton/blob/main/examples/model_examples/time-series/Hamilton-TimeSeriesmodel.ipynb) + + + # Notes Here's what this code executes: ![kaggle_submission_df](kaggle_submission_df.dot.png) diff --git a/examples/numpy/air-quality-analysis/README.md b/examples/numpy/air-quality-analysis/README.md index 31c803855..c0088d1f1 100644 --- a/examples/numpy/air-quality-analysis/README.md +++ b/examples/numpy/air-quality-analysis/README.md @@ -20,6 +20,12 @@ Is where the driver code lives to create the DAG and exercise it. To exercise it: > python run_analysis.py +You can even run this example in Google Colab: +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg) +](https://colab.research.google.com/github/dagworks-inc/hamilton/blob/main/examples/numpy/air-quality-analysis/hamilton_notebook.ipynb) + + + # Caveat The code found here was copied and pasted, and then tweaked to run with Hamilton. If something from the modeling perspective isn't clear, please read https://github.com/numpy/numpy-tutorials/blob/main/content/tutorial-air-quality-analysis.md diff --git a/examples/people_data_labs/README.md b/examples/people_data_labs/README.md index ad53bde95..d90c48fbe 100644 --- a/examples/people_data_labs/README.md +++ b/examples/people_data_labs/README.md @@ -34,6 +34,12 @@ This example showcases how Hamilton can help you write modular data transformati python run.py ``` +You can even run this example in Google Colab: +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg) +](https://colab.research.google.com/github/dagworks-inc/hamilton/blob/main/examples/people_data_labs/notebook.ipynb) + + + ## Resources - [PDL Blog](https://blog.peopledatalabs.com/) and [PDL Recipes](https://docs.peopledatalabs.com/recipes) - [Interactive Hamilton training](https://www.tryhamilton.dev/hamilton-basics/jumping-in) diff --git a/examples/polars/README.md b/examples/polars/README.md index 2ab3efb1f..1af5c3be2 100644 --- a/examples/polars/README.md +++ b/examples/polars/README.md @@ -17,6 +17,12 @@ To run things: > python my_script.py ``` +You can even run this example in Google Colab: +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg) +](https://colab.research.google.com/github/dagworks-inc/hamilton/blob/main/examples/polars/notebook.ipynb) + + + # Visualizing Execution Here is the graph of execution - which should look the same as the pandas example: diff --git a/examples/prefect/README.md b/examples/prefect/README.md index 41977a406..7dae05ac9 100644 --- a/examples/prefect/README.md +++ b/examples/prefect/README.md @@ -36,6 +36,12 @@ The easiest way to get this example running is to sign up for Prefect's free tie 4. Login to Prefect with your local machine using `prefect cloud login` 5. Execute the workflow by running `python run.py`. You should see a new run appear on your dashboard at https://app.prefect.cloud/ +6. You can even run this example in Google Colab: +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg) +](https://colab.research.google.com/github/dagworks-inc/hamilton/blob/main/examples/prefect/hamilton_prefect_notebook.ipynb) + + + ## Tips 1. Use Prefect [Blocks](https://docs.prefect.io/latest/concepts/blocks/) to store your Hamilton configuration. 
This way, you can edit it directly from your Prefect dashboard to launch different runs without altering your source code. ![blocks](./docs/prefect_config_block.JPG) diff --git a/examples/scikit-learn/README.md b/examples/scikit-learn/README.md index 95a397a47..1b2d54417 100644 --- a/examples/scikit-learn/README.md +++ b/examples/scikit-learn/README.md @@ -11,6 +11,12 @@ To run things: ```bash > python run.py ``` +You can even run this example in Google Colab: +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg) +](https://colab.research.google.com/github/dagworks-inc/hamilton/blob/main/examples/scikit-learn/hamilton_notebook.ipynb) + + + # DAG Visualization: Here is the visualization of the execution that the transformer currently performs if you run `run.py`: diff --git a/examples/slack/README.md b/examples/slack/README.md index 0339f3d92..f6e9b3ead 100644 --- a/examples/slack/README.md +++ b/examples/slack/README.md @@ -4,3 +4,7 @@ This example demonstrates how to use the Slack notifier. Fill in the details in `slack_notification_example.py` and then run it `python slack_notification_example.py`. You should see a message in your Slack channel. + +You can even run this example in Google Colab: +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg) +](https://colab.research.google.com/github/dagworks-inc/hamilton/blob/main/examples/slack/notebook.ipynb) diff --git a/hamilton/function_modifiers/delayed.py b/hamilton/function_modifiers/delayed.py index c4b8b9659..92c6df47b 100644 --- a/hamilton/function_modifiers/delayed.py +++ b/hamilton/function_modifiers/delayed.py @@ -76,11 +76,11 @@ def summation(df: pd.DataFrame, s1: str, s2: str) -> pd.Series: from hamilton.function_modifiers import resolve, ResolveAt @resolve( - when=ResolveAt.CONFIG_AVAILABLE + when=ResolveAt.CONFIG_AVAILABLE, decorate_with=lambda first_series_sum, second_series_sum: parameterize_sources( series_sum_1={"s1": first_series_sum[0], "s2": second_series_sum[1]}, series_sum_2={"s1": second_series_sum[1], "s2": second_series_sum[2]}, - + ) ) def summation(s1: pd.Series, s2: pd.Series) -> pd.Series: return s1 + s2
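For context on the docstring example being fixed in this last hunk: `@resolve` defers decorator construction until configuration is available, and the Hamilton docs note that it requires power-user mode to be enabled in the driver config. A hedged sketch of driving the corrected example — `my_module` is a hypothetical stand-in for the module containing the decorated `summation` function:

```python
# Sketch: build a driver for a module that uses @resolve (dynamic decorator resolution).
from hamilton import driver, settings

import my_module  # hypothetical module with the @resolve-decorated summation()

config = {
    "first_series_sum": ["a", "b", "c"],     # illustrative values; indexed by the decorate_with lambda
    "second_series_sum": ["d", "e", "f"],
    settings.ENABLE_POWER_USER_MODE: True,   # @resolve refuses to run without this flag
}
dr = driver.Driver(config, my_module)
# the DAG now contains series_sum_1 and series_sum_2, parameterized from config
```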