Skip to content

Commit

Permalink
good place to touch base
Browse files Browse the repository at this point in the history
  • Loading branch information
jernejfrank committed Sep 28, 2024
1 parent b78374b commit 9c58a46
Show file tree
Hide file tree
Showing 10 changed files with 1,274 additions and 1 deletion.
7 changes: 6 additions & 1 deletion docs/reference/decorators/pipe.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
=======================
pipe
pipe family
=======================

We have a family of decorators that can help with transforming the input and output of a node in the DAG. For a hands on example have a look at https://github.com/DAGWorks-Inc/hamilton/tree/main/examples/scikit-learn/species_distribution_modeling
Expand All @@ -24,3 +24,8 @@ pipe_output
----------------
.. autoclass:: hamilton.function_modifiers.macros.pipe_output
:special-members: __init__

mutate
----------------
.. autoclass:: hamilton.function_modifiers.macros.mutate
:special-members: __init__
30 changes: 30 additions & 0 deletions examples/mutate/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Mutate

We demonstrate the ability to mutate the outputs of functions in a distributed manner with `@mutate.`

Mutate gives the ability to apply the same transformation to the each output of multiple functions in the DAG. It can be particularly useful in the following scenarios:

1. Loading data and applying pre-cleaning step.
2. Feature engineering via joining, filtering, sorting, etc.
3. Experimenting with different transformations across nodes by selectively turning transformations on / off.


and effectively replaces:
1. Having to have unique names and then changing wiring if you want to add/remove/replace something.
2. Enabling more verb like names on functions.
3. Potentially simpler "reuse" of transform functions across DAG paths...

# Modules
The same modules can be viewed and executed in `notebook.ipynb`.

We have three modules:
1. procedural.py
2. pipe_output.py
3. mutate.py

that demonstrate the same behavior achieved either without Hamilton, using `pipe_output` or `mutate` and that should give you some idea of a potential application.

![image info](./dag.png)

# Description
We have two complimentary decorators that can help with transforming input / output of a node in the DAG: `pipe_input` and `pipe_output`.
Binary file added examples/mutate/dag.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
50 changes: 50 additions & 0 deletions examples/mutate/mutate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from typing import Any, List

import pandas as pd

from hamilton.function_modifiers import mutate, source, value


def data_1() -> pd.DataFrame:
df = pd.DataFrame.from_dict({"col_1": [3, 2, pd.NA, 0], "col_2": ["a", "b", pd.NA, "d"]})
return df


def data_2() -> pd.DataFrame:
df = pd.DataFrame.from_dict(
{"col_1": ["a", "b", pd.NA, "d", "e"], "col_2": [150, 155, 145, 200, 5000]}
)
return df


def data_3() -> pd.DataFrame:
df = pd.DataFrame.from_dict({"col_1": [150, 155, 145, 200, 5000], "col_2": [10, 23, 32, 50, 0]})
return df


# data1 and data2
@mutate(data_1, data_2)
def _filter(some_data: pd.DataFrame) -> pd.DataFrame:
return some_data.dropna()


# data 2
# this is for value
@mutate(data_2, missing_row=value(["c", 145]))
def _add_missing_value(some_data: pd.DataFrame, missing_row: List[Any]) -> pd.DataFrame:
some_data.loc[-1] = missing_row
return some_data


# data 2
# this is for source
@mutate(data_2, other_data=source("data_3"))
def _join(some_data: pd.DataFrame, other_data: pd.DataFrame) -> pd.DataFrame:
return some_data.set_index("col_2").join(other_data.set_index("col_1"))


# data1 and data2
@mutate(data_1, data_2)
def _sort(some_data: pd.DataFrame) -> pd.DataFrame:
columns = some_data.columns
return some_data.sort_values(by=columns[0])
894 changes: 894 additions & 0 deletions examples/mutate/notebook.ipynb

Large diffs are not rendered by default.

56 changes: 56 additions & 0 deletions examples/mutate/pipe_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from typing import Any, List

import pandas as pd

from hamilton.function_modifiers import pipe_output, source, step, value


# data1 and data2
def _filter(some_data: pd.DataFrame) -> pd.DataFrame:
return some_data.dropna()


# data 2
# this is for value
def _add_missing_value(some_data: pd.DataFrame, missing_row: List[Any]) -> pd.DataFrame:
some_data.loc[-1] = missing_row
return some_data


# data 2
# this is for source
def _join(some_data: pd.DataFrame, other_data: pd.DataFrame) -> pd.DataFrame:
return some_data.set_index("col_2").join(other_data.set_index("col_1"))


# data1 and data2
def _sort(some_data: pd.DataFrame) -> pd.DataFrame:
columns = some_data.columns
return some_data.sort_values(by=columns[0])


@pipe_output(
step(_filter),
step(_sort),
)
def data_1() -> pd.DataFrame:
df = pd.DataFrame.from_dict({"col_1": [3, 2, pd.NA, 0], "col_2": ["a", "b", pd.NA, "d"]})
return df


@pipe_output(
step(_filter),
step(_add_missing_value, missing_row=value(["c", 145])),
step(_join, other_data=source("data_3")),
step(_sort),
)
def data_2() -> pd.DataFrame:
df = pd.DataFrame.from_dict(
{"col_1": ["a", "b", pd.NA, "d", "e"], "col_2": [150, 155, 145, 200, 5000]}
)
return df


def data_3() -> pd.DataFrame:
df = pd.DataFrame.from_dict({"col_1": [150, 155, 145, 200, 5000], "col_2": [10, 23, 32, 50, 0]})
return df
67 changes: 67 additions & 0 deletions examples/mutate/procedural.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from typing import Any, List

import pandas as pd


def data_1() -> pd.DataFrame:
df = pd.DataFrame.from_dict({"col_1": [3, 2, pd.NA, 0], "col_2": ["a", "b", pd.NA, "d"]})
return df


def data_2() -> pd.DataFrame:
df = pd.DataFrame.from_dict(
{"col_1": ["a", "b", pd.NA, "d", "e"], "col_2": [150, 155, 145, 200, 5000]}
)
return df


def data_3() -> pd.DataFrame:
df = pd.DataFrame.from_dict({"col_1": [150, 155, 145, 200, 5000], "col_2": [10, 23, 32, 50, 0]})
return df


# data1 and data2
def _filter(some_data: pd.DataFrame) -> pd.DataFrame:
return some_data.dropna()


# data 2
# this is for value
def _add_missing_value(some_data: pd.DataFrame, missing_row: List[Any]) -> pd.DataFrame:
some_data.loc[-1] = missing_row
return some_data


# data 2
# this is for source
def _join(some_data: pd.DataFrame, other_data: pd.DataFrame) -> pd.DataFrame:
return some_data.set_index("col_2").join(other_data.set_index("col_1"))


# data1 and data2
def _sort(some_data: pd.DataFrame) -> pd.DataFrame:
columns = some_data.columns
return some_data.sort_values(by=columns[0])


if __name__ == "__main__":
# print("Filter data 1")
# print(_filter(data_1()))
# print("Sort data 1")
print("Final data 1")
print(_sort(_filter(data_1())))
# print("Filter data 2")
# print(_filter(data_2()))
# print("Add missing value data 2")
# print(_add_missing_value(_filter(data_2()),missing_row=['c', 145]))
# print("Join data 2 and data 3")
# print(_join(_add_missing_value(_filter(data_2()),missing_row=['c', 145]),other_data=data_3()))
# print("Sort joined dataframe")
print("Final data 2")
print(
_sort(
_join(
_add_missing_value(_filter(data_2()), missing_row=["c", 145]), other_data=data_3()
)
)
)
67 changes: 67 additions & 0 deletions examples/mutate/simple_procedural.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from __future__ import annotations

from typing import Any, List

import pandas as pd


def data_1() -> pd.DataFrame:
df = pd.DataFrame.from_dict({"col_1": [3, 2, pd.NA, 0], "col_2": ["a", "b", pd.NA, "d"]})
return df


def data_2() -> pd.DataFrame:
df = pd.DataFrame.from_dict(
{"col_1": ["a", "b", pd.NA, "d", "e"], "col_2": [150, 155, 145, 200, 5000]}
)
return df


def data_3() -> pd.DataFrame:
df = pd.DataFrame.from_dict({"col_1": [150, 155, 145, 200, 5000], "col_2": [10, 23, 32, 50, 0]})
return df


# data1 and data2
def _filter(some_data: pd.DataFrame) -> pd.DataFrame:
return some_data.dropna()


# data 2
# this is for value
def _add_missing_value(some_data: pd.DataFrame, missing_row: List[Any]) -> pd.DataFrame:
some_data.loc[-1] = missing_row
return some_data


# data 2
# this is for source
def _join(some_data: pd.DataFrame, other_data: pd.DataFrame) -> pd.DataFrame:
return some_data.set_index("col_2").join(other_data.set_index("col_1"))


# data1 and data2
def _sort(some_data: pd.DataFrame) -> pd.DataFrame:
columns = some_data.columns
return some_data.sort_values(by=columns[0])


if __name__ == "__main__":
print("Filter data 1")
print(_filter(data_1()))
print("Sort data 1")
print(_sort(_filter(data_1())))
print("Filter data 2")
print(_filter(data_2()))
print("Add missing value data 2")
print(_add_missing_value(_filter(data_2()), missing_row=["c", 145]))
print("Join data 2 and data 3")
print(_join(_add_missing_value(_filter(data_2()), missing_row=["c", 145]), other_data=data_3()))
print("Sort joined dataframe")
print(
_sort(
_join(
_add_missing_value(_filter(data_2()), missing_row=["c", 145]), other_data=data_3()
)
)
)
1 change: 1 addition & 0 deletions hamilton/function_modifiers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
pipe = macros.pipe
pipe_input = macros.pipe_input
pipe_output = macros.pipe_output
mutate = macros.mutate
step = macros.step

# resolve transform/model decorator
Expand Down
Loading

0 comments on commit 9c58a46

Please sign in to comment.