added Builder.with_materializers() #911

Merged · 5 commits · May 21, 2024
Binary file added docs/concepts/_snippets/config_when copy.png
Binary file added docs/concepts/_snippets/config_when.png
Binary file added docs/concepts/_snippets/decorator_ctx.png
33 changes: 33 additions & 0 deletions docs/concepts/_snippets/decorator_ctx.py
@@ -0,0 +1,33 @@
import pandas as pd
import xgboost

from hamilton.function_modifiers import load_from, save_to, source


# source("data_path") reads the value for `data_path` from the execution inputs
@load_from.parquet(path=source("data_path"))
def preprocessed_df(raw_df: pd.DataFrame) -> pd.DataFrame:
    """Preprocess raw data."""
    return ...


@save_to.json(path=source("model_path"))
def model(preprocessed_df: pd.DataFrame) -> xgboost.XGBModel:
    """Train model on preprocessed data."""
    return ...


if __name__ == "__main__":
    import __main__

    from hamilton import driver

    dr = driver.Builder().with_modules(__main__).build()

    data_path = "..."
    model_path = "..."
    inputs = dict(data_path=data_path, model_path=model_path)
    final_vars = ["save.model", "model"]
    results = dr.execute(final_vars, inputs=inputs)
    # results["model"] <- the model
    # results["save.model"] <- metadata from saving the model
@@ -18,16 +18,18 @@ def model(preprocessed_df: pd.DataFrame) -> xgboost.XGBModel:
    from hamilton import driver
    from hamilton.io.materialization import from_, to

    # this registers DataSaver and DataLoader objects
    from hamilton.plugins import pandas_extensions, xgboost_extensions  # noqa: F401

    dr = driver.Builder().with_modules(__main__).build()

    data_path = "..."
    model_dir = "..."
    materializers = [
        from_.parquet(path=data_path, target="raw_df"),
        to.json(path=f"{model_dir}/model.json", dependencies=["model"], id="model__json"),
        from_.parquet(target="raw_df", path=data_path),
        to.json(
            id="model__json",  # name of the DataSaver node
            dependencies=["model"],
            path=f"{model_dir}/model.json",
        ),
    ]

    dr.materialize(*materializers)
    dr = driver.Builder().with_modules(__main__).build()
    # executes all `to.` materializers; use `additional_vars` to execute other nodes
    metadata, results = dr.materialize(*materializers, additional_vars=["model"])
    # results["model"] <- the model
    # metadata["model__json"] <- metadata from saving the model
Binary file added docs/concepts/_snippets/materializers.png
1 change: 0 additions & 1 deletion docs/concepts/_snippets/node_ctx.py
@@ -33,6 +33,5 @@ def save_model(model: xgboost.XGBModel, model_dir: str) -> None:
    model_dir = "..."
    inputs = dict(data_path=data_path, model_dir=model_dir)
    final_vars = ["save_model"]

    results = dr.execute(final_vars, inputs=inputs)
    # results["save_model"] == None
34 changes: 34 additions & 0 deletions docs/concepts/_snippets/static_materializer_ctx.py
@@ -0,0 +1,34 @@
import pandas as pd
import xgboost


def preprocessed_df(raw_df: pd.DataFrame) -> pd.DataFrame:
    """Preprocess raw data."""
    return ...


def model(preprocessed_df: pd.DataFrame) -> xgboost.XGBModel:
    """Train model on preprocessed data."""
    return ...


if __name__ == "__main__":
    import __main__

    from hamilton import driver
    from hamilton.io.materialization import from_, to

    data_path = "..."
    model_dir = "..."
    materializers = [
        from_.parquet(target="raw_df", path=data_path),
        to.json(
            id="model__json",  # name of the DataSaver node
            dependencies=["model"],
            path=f"{model_dir}/model.json",
        ),
    ]
    dr = driver.Builder().with_modules(__main__).with_materializers(*materializers).build()
    results = dr.execute(["model", "model__json"])
    # results["model"] <- the model
    # results["model__json"] <- metadata from saving the model
52 changes: 52 additions & 0 deletions docs/concepts/builder.rst
@@ -111,6 +111,58 @@ This is directly related to the ``@config`` function decorator (see :ref:`config-decorators`)
        .build()
    )

    dr.display_all_functions("dag.png")


.. image:: ./_snippets/config_when.png
    :align: center


with_materializers()
____________________

Adds ``DataSaver`` and ``DataLoader`` nodes to your dataflow. This allows you to visualize these nodes with ``Driver.display_all_functions()`` and execute them by name with ``Driver.execute()``. More details are on the :doc:`materialization` documentation page.

.. code-block:: python

    # my_dataflow.py
    import pandas as pd

    def clean_df(raw_df: pd.DataFrame) -> pd.DataFrame:
        return ...

    def features_df(clean_df: pd.DataFrame) -> pd.DataFrame:
        return ...

.. code-block:: python

    # run.py
    from hamilton import driver
    from hamilton.io.materialization import from_, to
    import my_dataflow

    loader = from_.parquet(target="raw_df", path="/my/raw_file.parquet")
    saver = to.parquet(
        id="features__parquet",
        dependencies=["features_df"],
        path="/my/feature_file.parquet",
    )

    dr = (
        driver.Builder()
        .with_modules(my_dataflow)
        .with_materializers(loader, saver)
        .build()
    )
    dr.display_all_functions("dag.png")

    dr.execute(["features__parquet"])

.. image:: ./_snippets/materializers.png
    :align: center


with_adapters()
---------------

130 changes: 70 additions & 60 deletions docs/concepts/materialization.rst
@@ -6,102 +6,117 @@ So far, we executed our dataflow using the ``Driver.execute()`` method, which can…

On this page, you'll learn:

- The difference between ``.execute()`` and ``.materialize()``
- How to load and save data in Hamilton
- Why use materialization
- What are DataSaver and DataLoader objects
- What are ``DataSaver`` and ``DataLoader`` objects
- The difference between ``.execute()`` and ``.materialize()``
- The basics to write your own materializer

Different ways to write the same dataflow
-----------------------------------------

Below are 3 ways to write a dataflow that:
Below are 5 ways to write a dataflow that:

1. loads a dataframe from a parquet file
2. preprocesses the dataframe
3. trains a machine learning model
4. saves the trained model

The first two options use ``Driver.execute()`` and the latter ``Driver.materialize()``. Notice where in the code data is loaded and saved and how it affects the dataflow.
The first two options don't use the concept of materialization and the next three do.

Without materialization
-----------------------

.. table:: Model training
.. table::
:align: left

+----------------------------------------------+-----------------------------------------------+--------------------------------------------------------+
| Nodes / dataflow context | Driver context | Materialization |
+==============================================+===============================================+========================================================+
| .. literalinclude:: _snippets/node_ctx.py | .. literalinclude:: _snippets/driver_ctx.py | .. literalinclude:: _snippets/materializer_ctx.py |
| | | |
+----------------------------------------------+-----------------------------------------------+--------------------------------------------------------+
| .. image:: _snippets/node_ctx.png | .. image:: _snippets/driver_ctx.png | .. image:: _snippets/materializer_ctx.png |
| :width: 500px | :width: 500px | :width: 500px |
+----------------------------------------------+-----------------------------------------------+--------------------------------------------------------+
+----------------------------------------------+-----------------------------------------------+
| 1) From nodes | 2) From ``Driver`` |
+==============================================+===============================================+
| .. literalinclude:: _snippets/node_ctx.py | .. literalinclude:: _snippets/driver_ctx.py |
| | |
+----------------------------------------------+-----------------------------------------------+
| .. image:: _snippets/node_ctx.png | .. image:: _snippets/driver_ctx.png |
| :width: 500px | :width: 500px |
+----------------------------------------------+-----------------------------------------------+

As explained previously, ``Driver.execute()`` walks the graph to compute the list of nodes you requested by name. For ``Driver.materialize()``, you give it a list of data loaders (``from_``) and data savers (``to``). Each one adds a node to the dataflow before execution.
Observations:

.. note::
1. These two approaches load and save data using ``pandas`` and ``xgboost`` without any Hamilton constructs. They are transparent and simple to get started with, but as the number of nodes grows (or across projects), defining one node per parquet file introduces a lot of boilerplate (see the sketch below).
2. Using **1) from nodes** improves visibility by including loading & saving in the dataflow (as illustrated).
3. Using **2) from ``Driver``** makes it easy to modify loading & saving at execution time, without modifying the dataflow itself. It is particularly useful when moving from development to production.
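
For reference, a hand-written loader node for approach **1) from nodes** might look like this minimal sketch (reusing the ``raw_df``/``data_path`` names from the snippets):

.. code-block:: python

    import pandas as pd

    def raw_df(data_path: str) -> pd.DataFrame:
        """One such loader node is needed per file to load."""
        return pd.read_parquet(data_path)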

``Driver.materialize()`` can do everything ``Driver.execute()`` does, and more. It can receive ``inputs`` and ``overrides``. Instead of using ``final_vars``, you can use ``additional_vars`` to request nodes that you don't want to materialize/save.
Limitations
~~~~~~~~~~~~

Why use materialization
-----------------------
Materialization aims to solve 3 limitations:

1. **Redundancy**: deduplicate loading & saving code to improve maintainability and debugging
2. **Observability**: include loading & saving in the dataflow for full observability and allow hooks
3. **Flexibility**: change the loading & saving behavior without editing the dataflow (see the sketch below)
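
For instance, because loaders are declared outside the dataflow, swapping storage formats becomes a one-line change. A sketch, assuming a CSV loader is registered for pandas (e.g., via ``hamilton.plugins.pandas_extensions``):

.. code-block:: python

    from hamilton.io.materialization import from_

    loader = from_.parquet(target="raw_df", path="data.parquet")
    # loader = from_.csv(target="raw_df", path="data.csv")  # drop-in replacement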

Let's compare the benefits of the 3 different approaches

Nodes / dataflow context
~~~~~~~~~~~~~~~~~~~~~~~~
With materialization
--------------------

This approach defines data loading and saving as part of the dataflow and uses ``Driver.execute()``. It is usually the simplest approach and the one you should start with.
.. table::
:align: left

Benefits
+-------------------------------------------------------------+-------------------------------------------------------------+-------------------------------------------------+
| 3) Static materializers | 4) Dynamic materializers | 5) Function modifiers |
+=============================================================+=============================================================+=================================================+
| .. literalinclude:: _snippets/static_materializer_ctx.py | .. literalinclude:: _snippets/dynamic_materializer_ctx.py | .. literalinclude:: _snippets/decorator_ctx.py |
| | | |
+-------------------------------------------------------------+-------------------------------------------------------------+-------------------------------------------------+
| .. image:: _snippets/static_materializer_ctx.png | .. image:: _snippets/dynamic_materializer_ctx.png | .. image:: _snippets/decorator_ctx.png |
| :width: 500px | :width: 500px | :width: 500px |
+-------------------------------------------------------------+-------------------------------------------------------------+-------------------------------------------------+

- the functions ``raw_df()`` and ``save_model()`` are transparent as to how they load/save data
- can easily change data location using the strings ``data_path`` and ``model_dir`` as inputs
- all operations are part of the dataflow

Limitations
Static materializers
~~~~~~~~~~~~~~~~~~~~

- need to write a unique function for each loaded parquet file and saved model. To reduce code duplication, one could write a utility function ``_load_parquet()``
- can be too restrictive as to how to load data. Using ``overrides`` in the ``.execute()`` call can add flexibility.
Passing ``from_`` and ``to`` Hamilton objects to ``Builder().with_materializers()`` injects standardized loading and saving nodes into the dataflow. This solves the 3 limitations highlighted in the previous section:

Driver context
~~~~~~~~~~~~~~
1. Redundancy ✅: Using the ``from_`` and ``to`` Hamilton constructs reduces the boilerplate to load and save data from common formats (JSON, parquet, CSV, etc.) and to interact with 3rd party libraries (pandas, matplotlib, xgboost, dlt, etc.)
2. Observability ✅: Loaders and savers are part of the dataflow. You can view them with ``Driver.display_all_functions()`` and execute nodes by requesting them with ``Driver.execute()``.
3. Flexibility ✅: The loading and saving behavior is decoupled from the dataflow and can be modified easily when creating the ``Driver`` and executing code.

This approach loads and saves data outside the dataflow and uses ``Driver.execute()``. Since the Driver is responsible for executing your dataflow, it makes sense to handle data loading/saving in the context of the "driver code" (e.g., ``run.py``) if loading and saving change often.

Benefits
Dynamic materializers
~~~~~~~~~~~~~~~~~~~~~

- the Driver user is responsible for loading/saving data
- fewer dataflow functions to define and maintain
- the functions for ``raw_df()`` and ``save_model()`` can live in another Python module that you can optionally build the Driver with.
The dataflow is executed by passing ``from_`` and ``to`` objects to ``Driver.materialize()`` instead of the regular ``Driver.execute()``. This approach resembles **2) from ``Driver``**:

Limitations
.. note::

- add complexity to the "driver code".
- lose the benefits of Hamilton for loading and saving operations (visualize, lifecycle hook, etc.)
- to add flexibility to data loading/saving, one can adopt the **nodes/dataflow context** approach and add functions with ``@config`` for alternative implementations (see :ref:`config-decorators`).
``Driver.materialize()`` receives data loaders (``from_``) and data savers (``to``) and will execute all ``to`` materializers passed. Like ``Driver.execute()``, it can receive ``inputs`` and ``overrides``, but instead of ``final_vars`` it receives ``additional_vars``.
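
A minimal sketch of such a call, reusing the names from the snippet above:

.. code-block:: python

    metadata, results = dr.materialize(
        from_.parquet(target="raw_df", path=data_path),
        to.json(id="model__json", dependencies=["model"], path=f"{model_dir}/model.json"),
        additional_vars=["model"],  # computed and returned, but not saved
    )
    # metadata["model__json"] <- metadata from saving the model
    # results["model"] <- the model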

1. Redundancy ✅: Uses ``from_`` and ``to`` Hamilton constructs.
2. Observability 🚸: Materializers are visible with ``Driver.visualize_materialization()``, but can't be introspected otherwise. Also, you need to rely on ``Driver.materialize()`` which has a different call signature.
3. Flexibility ✅: Loading and saving is decoupled from the dataflow.

Materialization
~~~~~~~~~~~~~~~
.. note::

This approach tries to strike a balance between the two previous methods and uses ``Driver.materialize()``.
Using static materializers is typically preferable. Static and dynamic materializers can be used together: build with ``Builder().with_materializers()`` and later call ``dr.materialize()``.
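
A sketch of combining both, assuming ``loader`` and ``saver`` materializer objects like those in the ``builder.rst`` example above:

.. code-block:: python

    dr = (
        driver.Builder()
        .with_modules(my_dataflow)
        .with_materializers(loader)  # static: part of the dataflow at build time
        .build()
    )
    # dynamic: pass additional materializers at execution time
    metadata, results = dr.materialize(saver, additional_vars=["features_df"])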

Unique benefits
Function modifiers
~~~~~~~~~~~~~~~~~~

- Use the Hamilton logic to combine nodes (more on that later)
- Get tested code for common data loading and saving out-of-the-box (e.g., JSON, CSV, Parquet, pickle)
- Easily save the same node to multiple formats
Adding the ``@load_from`` and ``@save_to`` function modifiers (:ref:`loader-saver-decorators`) to Hamilton functions generates materializers when the module is loaded via ``Builder.with_modules()``. This approach resembles **1) from nodes**:

Benefits
.. note::

- Flexibility for Driver users to change data location
- Fewer dataflow functions to define and maintain
- All operations are part of the dataflow
Under the hood, the ``@load_from`` modifier uses the same code as ``from_`` to load data; the same holds for ``@save_to`` and ``to``.

Limitations
1. Redundancy 🚸: Using ``@load_from`` and ``@save_to`` reduces redundancy. However, to make a loaded table available to multiple nodes, you would need to decorate each node with the same ``@load_from``. Also, it might be impractical to decorate dynamically generated nodes (e.g., when using the ``@parameterize`` function modifier).
2. Observability ✅: Loaders and savers are part of the dataflow.
3. Flexibility 🚸: You can modify the path and materializer kwargs at runtime using ``source()`` in the decorator definition, but you can't change the format itself (e.g., from parquet to CSV). See the example below.
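
For example, with the decorator approach the save location can be supplied at runtime (reusing the ``decorator_ctx.py`` snippet above; the ``model_v2.json`` path is illustrative):

.. code-block:: python

    @save_to.json(path=source("model_path"))  # resolved from inputs at runtime
    def model(preprocessed_df: pd.DataFrame) -> xgboost.XGBModel:
        return ...

    # change where the model is saved without touching the dataflow
    results = dr.execute(["save.model", "model"], inputs={"model_path": "model_v2.json"})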

.. note::

It can be desirable to couple loading and saving to the dataflow using function modifiers. It makes it clear when reading the dataflow definition which nodes should load or save data using external sources.

- Writing a custom DataSaver or DataLoader requires more effort than adding a function to the dataflow.
- Adds *some* complexity to the Driver (e.g., ``run.py``).

DataLoader and DataSaver
------------------------
@@ -118,8 +133,3 @@ Here are simplified snippets for saving and loading an XGBoost model to/from JSON:
+----------------------------------------------+-----------------------------------------------+

To define your own ``DataSaver`` and ``DataLoader``, the Hamilton `XGBoost extension <https://github.com/DAGWorks-Inc/hamilton/blob/main/hamilton/plugins/xgboost_extensions.py>`_ provides a good example.
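
For orientation, a custom saver is typically a dataclass subclassing ``DataSaver`` — a rough sketch of the pattern (the class name and metadata payload here are illustrative; check ``hamilton.io.data_adapters`` for the exact interface):

.. code-block:: python

    import dataclasses
    from typing import Any, Collection, Dict, Type

    import xgboost

    from hamilton.io.data_adapters import DataSaver


    @dataclasses.dataclass
    class XGBoostJsonWriter(DataSaver):
        path: str

        @classmethod
        def applicable_types(cls) -> Collection[Type]:
            # node output types this saver can handle
            return [xgboost.XGBModel]

        def save_data(self, data: xgboost.XGBModel) -> Dict[str, Any]:
            data.save_model(self.path)  # xgboost's native JSON serialization
            return {"path": self.path}  # metadata returned to the caller

        @classmethod
        def name(cls) -> str:
            return "json"  # enables to.json(...) for XGBModel outputs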

``@load_from`` and ``@save_to``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Also, the data loaders and savers power the ``@load_from`` and ``@save_to`` :ref:`loader-saver-decorators`