From c203dfc5674b7e51d80eb103681d256cfcc906d0 Mon Sep 17 00:00:00 2001 From: elijahbenizzy Date: Sat, 12 Aug 2023 16:43:30 -0700 Subject: [PATCH 1/8] Fixes materialization viz function It now: 1. Works without additional_vars included 2. Returns the graphviz object for rendering in a notebook --- hamilton/driver.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/hamilton/driver.py b/hamilton/driver.py index 28e738b88..004534624 100644 --- a/hamilton/driver.py +++ b/hamilton/driver.py @@ -997,9 +997,9 @@ def materialize( def visualize_materialization( self, *materializers: materialization.MaterializerFactory, - additional_vars: List[Union[str, Callable, Variable]], output_file_path: str, render_kwargs: dict, + additional_vars: List[Union[str, Callable, Variable]] = None, inputs: Dict[str, Any] = None, graphviz_kwargs: dict = None, ) -> Optional["graphviz.Digraph"]: # noqa F821 @@ -1014,11 +1014,13 @@ def visualize_materialization( :param graphviz_kwargs: Arguments to pass to graphviz :return: The graphviz graph, if you want to do something with it """ + if additional_vars is None: + additional_vars = [] function_graph = materialization.modify_graph(self.graph, materializers) _final_vars = self._create_final_vars(additional_vars) + [ materializer.id for materializer in materializers ] - Driver._visualize_execution_helper( + return Driver._visualize_execution_helper( function_graph, self.adapter, _final_vars, From 2be6fcd2c0099f9784c948788e6c9c44f557d057 Mon Sep 17 00:00:00 2001 From: elijahbenizzy Date: Sat, 12 Aug 2023 16:45:30 -0700 Subject: [PATCH 2/8] Fixes class hierarchy in dependencies.py --- hamilton/function_modifiers/dependencies.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hamilton/function_modifiers/dependencies.py b/hamilton/function_modifiers/dependencies.py index 1d3dcbd7f..575ed7188 100644 --- a/hamilton/function_modifiers/dependencies.py +++ b/hamilton/function_modifiers/dependencies.py @@ -29,7 +29,7 @@ class SingleDependency(ParametrizedDependency, abc.ABC): @dataclasses.dataclass -class LiteralDependency(ParametrizedDependency): +class LiteralDependency(SingleDependency): value: Any def get_dependency_type(self) -> ParametrizedDependencySource: @@ -37,7 +37,7 @@ def get_dependency_type(self) -> ParametrizedDependencySource: @dataclasses.dataclass -class UpstreamDependency(ParametrizedDependency): +class UpstreamDependency(SingleDependency): source: str def get_dependency_type(self) -> ParametrizedDependencySource: From 6f6ed2d2c6a1458aa21636b981a2c04c250081aa Mon Sep 17 00:00:00 2001 From: elijahbenizzy Date: Sat, 12 Aug 2023 16:45:46 -0700 Subject: [PATCH 3/8] Fixes duplication of node dependencies This was added when we added graph copying. This adds the ability for update_dependencies to wipe the dependencies before executing. Note that it does an in-place modification if this doesn't happen -- that's the reset_dependencies option. Note this is an internal API so I don't mind it being geared towards performance. 
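A rough usage sketch of the new flag (illustrative only, not part of the patch): the toy functions `a`/`b` are invented, while `graph.update_dependencies`, `node.Node.from_fn`, `node.copy(include_refs=...)`, and `base.DefaultAdapter` all come from the diff below.

```python
from hamilton import base, graph, node


def a() -> int:
    return 1


def b(a: int) -> int:
    return a + 1


# Build bare nodes (no dependency references populated yet).
nodes = {n.name: n for n in (node.Node.from_fn(a), node.Node.from_fn(b))}
adapter = base.DefaultAdapter()

# With reset_dependencies=True (the default), each node is first copied via
# node.copy(include_refs=False), so repeated calls rebuild edges from scratch
# instead of appending duplicate dependencies to the passed-in nodes.
new_nodes = graph.update_dependencies(nodes, adapter, reset_dependencies=True)
assert [dep.name for dep in new_nodes["b"].dependencies] == ["a"]

# Callers that know dependencies were never set (e.g. create_function_graph)
# can skip the copy and mutate in place:
graph.update_dependencies(nodes, adapter, reset_dependencies=False)
```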
--- hamilton/graph.py | 15 ++++++++++----- hamilton/node.py | 23 ++++++++++++++++++++--- tests/test_graph.py | 26 +++++++++++++++++--------- 3 files changed, 47 insertions(+), 17 deletions(-) diff --git a/hamilton/graph.py b/hamilton/graph.py index 2f82fd645..57ec99ca1 100644 --- a/hamilton/graph.py +++ b/hamilton/graph.py @@ -68,19 +68,23 @@ def add_dependency( def update_dependencies( - nodes: Dict[str, node.Node], adapter: base.HamiltonGraphAdapter, in_place: bool = True + nodes: Dict[str, node.Node], adapter: base.HamiltonGraphAdapter, reset_dependencies: bool = True ): - """Adds dependecies to a dictionary of nodes. If in_place is False, + """Adds dependencies to a dictionary of nodes. If in_place is False, it will deepcopy the dict + nodes and return that. Otherwise it will mutate + return the passed-in dict + nodes. :param in_place: Whether or not to modify in-place, or copy/return :param nodes: Nodes that form the DAG we're updating :param adapter: Adapter to use for type checking + :param reset_dependencies: Whether or not to reset the dependencies. If they are not set this is + unnecessary, and we can save yet another pass. Note that `reset` will perform an in-place + operation. :return: The updated nodes """ - if not in_place: - nodes = {k: v for k, v in nodes.items()} + # copy without the dependencies to avoid duplicates + if reset_dependencies: + nodes = {k: v.copy(include_refs=False) for k, v in nodes.items()} for node_name, n in list(nodes.items()): for param_name, (param_type, _) in n.input_types.items(): add_dependency(n, node_name, nodes, param_name, param_type, adapter) @@ -118,7 +122,8 @@ def create_function_graph( ) nodes[n.name] = n # add dependencies -- now that all nodes exist, we just run through edges & validate graph. - update_dependencies(nodes, adapter) # in place + nodes = update_dependencies(nodes, adapter, reset_dependencies=False) # no dependencies + # present yet for key in config.keys(): if key not in nodes: nodes[key] = node.Node(key, Any, node_source=node.NodeType.EXTERNAL) diff --git a/hamilton/node.py b/hamilton/node.py index 50f4a5c9b..3bba90aa6 100644 --- a/hamilton/node.py +++ b/hamilton/node.py @@ -108,7 +108,7 @@ def __init__( DependencyType.from_parameter(value), ) elif self.user_defined: - if input_types is not None: + if len(self._input_types) > 0: raise ValueError( f"Input types cannot be provided for user-defined node {self.name}" ) @@ -266,11 +266,12 @@ def from_fn(fn: Callable, name: str = None) -> "Node": node_source=node_source, ) - def copy_with(self, **overrides) -> "Node": + def copy_with(self, include_refs: bool = True, **overrides) -> "Node": """Copies a node with the specified overrides for the constructor arguments. Utility function for creating a node -- useful for modifying it. :param kwargs: kwargs to use in place of the node. Passed to the constructor. + :param include_refs: Whether or not to include dependencies and depended_on_by :return: A node copied from self with the specified keyword arguments replaced. """ constructor_args = dict( @@ -284,4 +285,20 @@ def copy_with(self, **overrides) -> "Node": originating_functions=self.originating_functions, ) constructor_args.update(**overrides) - return Node(**constructor_args) + out = Node(**constructor_args) + if include_refs: + out._dependencies = self._dependencies + out._depended_on_by = self._depended_on_by + return out + + def copy(self, include_refs: bool = True) -> "Node": + """Copies a node, not modifying anything (except for the references + /dependencies if specified). 
+ + :param include_refs: Whether or not to include dependencies and depended_on_by + :return: A copy of the node. + """ + """Gives a copy of the node, so we can modify it without modifying the original. + :return: A copy of the node. + """ + return self.copy_with(include_refs) diff --git a/tests/test_graph.py b/tests/test_graph.py index 0431b0893..96d908506 100644 --- a/tests/test_graph.py +++ b/tests/test_graph.py @@ -220,15 +220,6 @@ def test_add_dependency_user_nodes(): assert func_node.depended_on_by == [] -def test_create_function_graph_simple(): - """Tests that we create a simple function graph.""" - expected = create_testing_nodes() - actual = graph.create_function_graph( - tests.resources.dummy_functions, config={}, adapter=base.SimplePythonDataFrameGraphAdapter() - ) - assert actual == expected - - def create_testing_nodes(): """Helper function for creating the nodes represented in dummy_functions.py.""" nodes = { @@ -275,6 +266,15 @@ def create_testing_nodes(): return nodes +def test_create_function_graph_simple(): + """Tests that we create a simple function graph.""" + expected = create_testing_nodes() + actual = graph.create_function_graph( + tests.resources.dummy_functions, config={}, adapter=base.SimplePythonDataFrameGraphAdapter() + ) + assert actual == expected + + def test_execute(): """Tests graph execution along with basic memoization since A is depended on by two functions.""" adapter = base.SimplePythonDataFrameGraphAdapter() @@ -800,3 +800,11 @@ def my_function(A: int, b: int, c: int) -> int: ) results = fg.execute([n for n in fg.get_nodes() if n.name in ["my_function", "A"]]) assert results == {"A": 4, "b": 3, "c": 1, "my_function": 8} + + +def test_update_dependencies(): + nodes = create_testing_nodes() + new_nodes = graph.update_dependencies(nodes, base.DefaultAdapter()) + for node_name, node_ in new_nodes.items(): + assert node_.dependencies == nodes[node_name].dependencies + assert node_.depended_on_by == nodes[node_name].depended_on_by From af2e6ef6c62ede0dd5f341a54a822c85ee56a98f Mon Sep 17 00:00:00 2001 From: elijahbenizzy Date: Sat, 12 Aug 2023 16:47:35 -0700 Subject: [PATCH 4/8] Fixes materialize bug This was referring to data loaders instead of data savers in the registry. --- hamilton/io/materialization.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/hamilton/io/materialization.py b/hamilton/io/materialization.py index 3688d7a6f..0bf6fc742 100644 --- a/hamilton/io/materialization.py +++ b/hamilton/io/materialization.py @@ -7,7 +7,7 @@ from hamilton.function_modifiers.dependencies import SingleDependency, value from hamilton.graph import FunctionGraph from hamilton.io.data_adapters import DataSaver -from hamilton.registry import LOADER_REGISTRY +from hamilton.registry import SAVER_REGISTRY class materialization_meta__(type): @@ -19,17 +19,17 @@ class in registry, or make it a function that just proxies to the decorator. We """ def __getattr__(cls, item: str): - if item in LOADER_REGISTRY: - potential_loaders = LOADER_REGISTRY[item] + if item in SAVER_REGISTRY: + potential_loaders = SAVER_REGISTRY[item] savers = [loader for loader in potential_loaders if issubclass(loader, DataSaver)] if len(savers) > 0: - return Materialize.partial(LOADER_REGISTRY[item]) + return Materialize.partial(SAVER_REGISTRY[item]) try: return super().__getattribute__(item) except AttributeError as e: raise AttributeError( - f"No loader named: {item} available for {cls.__name__}. " - f"Available loaders are: {LOADER_REGISTRY.keys()}. 
" + f"No data materializer named: {item}. " + f"Available materializers are: {SAVER_REGISTRY.keys()}. " f"If you've gotten to this point, you either (1) spelled the " f"loader name wrong, (2) are trying to use a loader that does" f"not exist (yet)" @@ -76,6 +76,7 @@ def _process_kwargs( """ processed_kwargs = {} for kwarg, kwarg_val in data_saver_kwargs.items(): + if not isinstance(kwarg_val, SingleDependency): processed_kwargs[kwarg] = value(kwarg_val) else: From ae6636e0887a130985f3295e7b8adac86b1b7e49 Mon Sep 17 00:00:00 2001 From: elijahbenizzy Date: Sat, 12 Aug 2023 16:59:25 -0700 Subject: [PATCH 5/8] Adds examples for materializers We do the ML model example, and add some custom ones. Hopefully this gets people started. We have an easy script to run + a notebook. --- examples/materialization/README.md | 63 +++ .../materialization/custom_materializers.py | 55 ++ examples/materialization/dag.pdf | Bin 0 -> 28559 bytes examples/materialization/data_loaders.py | 30 + examples/materialization/model_training.py | 96 ++++ examples/materialization/notebook.ipynb | 535 ++++++++++++++++++ examples/materialization/requirements.txt | 2 + examples/materialization/run.py | 87 +++ 8 files changed, 868 insertions(+) create mode 100644 examples/materialization/README.md create mode 100644 examples/materialization/custom_materializers.py create mode 100644 examples/materialization/dag.pdf create mode 100644 examples/materialization/data_loaders.py create mode 100644 examples/materialization/model_training.py create mode 100644 examples/materialization/notebook.ipynb create mode 100644 examples/materialization/requirements.txt create mode 100644 examples/materialization/run.py diff --git a/examples/materialization/README.md b/examples/materialization/README.md new file mode 100644 index 000000000..24cf6fe2a --- /dev/null +++ b/examples/materialization/README.md @@ -0,0 +1,63 @@ +# Materialization + +Hamilton's driver allows for ad-hoc materialization. This enables you to take a DAG you already have, +and save your data to a set of custom locations/url. + +Note that these materializers are _isomorphic_ in nature to the +[@save_to](https://hamilton.dagworks.io/en/latest/reference/decorators/save_to/) +decorator. Materializers inject the additional node at runtime, modifying the +DAG to include a data saver node, and returning the metadata around materialization. + +This framework is meant to be highly pluggable. While the set of available data savers is currently +limited, we expect folks to build their own materializers (and, hopefully, contribute them back to the community!). + + +## example +In this example we take the scikit-learn iris_loader pipeline, and materialize outputs to specific +locations through a driver call. We demonstrate: + +1. Saving model parameters to a json file (using the default json materializer) +2. Writing a custom data adapters for: + 1. Pickling a model to an object file + 2. Saving confusion matrices to a csv file + +See [run.py](run.py) for the full example. + + +## `driver.materialize` + +This will be a high-level overview. For more details, +see [documentation](https://hamilton.dagworks.io/en/latest/reference/drivers/Driver/#hamilton.driver.Driver.materializehttps://hamilton.dagworks.io/en/latest/reference/drivers/Driver/#hamilton.driver.Driver.materialize). + +`driver.materialize()` does the following: +1. Processes a list of materializers to create a new DAG +2. Alters the output to include the materializer nodes +3. 
+3. Processes a list of "additional variables" (for debugging) to return intermediary data
+4. Executes the DAG, including the materializers
+5. Returns a tuple of (`materialization metadata`, `additional variables`)
+
+Materializers each consume:
+1. A `dependencies` list to materialize
+2. An (optional) `combine` parameter to combine the outputs of the dependencies
+(this is required if there are multiple dependencies). This is a [ResultMixin](https://hamilton.dagworks.io/en/latest/concepts/customizing-execution/#result-builders) object
+3. An `id` parameter to identify the materializer, which serves as the node name in the DAG
+
+Materializers are referenced by the `to` object in `hamilton.io.materialization`, which utilizes
+dynamic dispatch to create the appropriate materializer.
+
+These refer to `DataSaver` implementations, which are keyed by a string (e.g. `csv`).
+Multiple data adapters can share the same key, each of which applies to a specific type
+(e.g. pandas dataframe, numpy matrix, polars dataframe). New
+data adapters are registered by calling `hamilton.registry.register_adapter`.
+
+## Custom Materializers
+
+To define a custom materializer, all you have to do is implement the `DataSaver` class
+(which will allow use in `save_to` as well). This is demonstrated in [custom_materializers.py](custom_materializers.py).
+
+## `driver.materialize` vs `@save_to`
+
+`driver.materialize` is an ad-hoc form of `save_to`. You want to use this when you're developing, and
+want to do ad-hoc materialization. When you have a production ETL, you can choose between `save_to` and `materialize`.
+If the save location/structure is unlikely to change, then you might consider using `save_to`. Otherwise, `materialize`
+is an idiomatic way of conducting the materialization operations that cleanly separates side-effects from transformations.
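A minimal sketch of the call pattern the README above describes, mirroring run.py further down in this patch. `my_module` and the node names `model_parameters`/`classification_report` are stand-ins for whatever your own DAG exposes; the `to.json` arguments and the return shape come from this example.

```python
from hamilton import driver
from hamilton.io.materialization import to

import my_module  # placeholder: a module defining model_parameters, classification_report, ...

dr = driver.Builder().with_modules(my_module).build()

# Returns a tuple of (materialization metadata, additional variables).
metadata, extra_vars = dr.materialize(
    to.json(
        id="model_params_to_json",          # node name injected into the DAG
        dependencies=["model_parameters"],  # node(s) whose output gets saved
        path="./data/params.json",
    ),
    additional_vars=["classification_report"],  # intermediate values returned for debugging
)
```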
diff --git a/examples/materialization/custom_materializers.py b/examples/materialization/custom_materializers.py
new file mode 100644
index 000000000..a2508ae43
--- /dev/null
+++ b/examples/materialization/custom_materializers.py
@@ -0,0 +1,55 @@
+import dataclasses
+import pickle
+from typing import Any, Collection, Dict, Type
+
+import numpy as np
+from sklearn import base
+
+from hamilton import registry
+from hamilton.io import utils
+from hamilton.io.data_adapters import DataSaver
+
+# TODO -- put this back in the standard library
+
+
+@dataclasses.dataclass
+class NumpyMatrixToCSV(DataSaver):
+    path: str
+    sep: str = ","
+
+    def __post_init__(self):
+        if not self.path.endswith(".csv"):
+            raise ValueError(f"CSV files must end with .csv, got {self.path}")
+
+    def save_data(self, data: np.ndarray) -> Dict[str, Any]:
+        np.savetxt(self.path, data, delimiter=self.sep)
+        return utils.get_file_metadata(self.path)
+
+    @classmethod
+    def applicable_types(cls) -> Collection[Type]:
+        return [np.ndarray]
+
+    @classmethod
+    def name(cls) -> str:
+        return "csv"
+
+
+@dataclasses.dataclass
+class SKLearnPickler(DataSaver):
+    path: str
+
+    def save_data(self, data: base.ClassifierMixin) -> Dict[str, Any]:
+        pickle.dump(data, open(self.path, "wb"))
+        return utils.get_file_metadata(self.path)
+
+    @classmethod
+    def applicable_types(cls) -> Collection[Type]:
+        return [base.ClassifierMixin]
+
+    @classmethod
+    def name(cls) -> str:
+        return "pickle"
+
+
+for adapter in [NumpyMatrixToCSV, SKLearnPickler]:
+    registry.register_adapter(adapter)
diff --git a/examples/materialization/dag.pdf b/examples/materialization/dag.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..3d123bb076545c5c435aa13b3afdbc0026a9c92b
GIT binary patch
literal 28559
[binary PDF payload omitted]
utils.Bunch:
+    return datasets.load_digits()
+
+
+@config.when(data_loader="digits")
+def data__digits() -> utils.Bunch:
+    return datasets.load_digits()
+
+
+def target(data: utils.Bunch) -> np.ndarray:
+    return data.target
+
+
+def target_names(data: utils.Bunch) -> np.ndarray:
+    return data.target_names
+
+
+def feature_matrix(data: utils.Bunch) -> np.ndarray:
+    return data.data
diff --git a/examples/materialization/model_training.py b/examples/materialization/model_training.py
new file mode 100644
index 000000000..894cc533a
--- /dev/null
+++ 
b/examples/materialization/model_training.py @@ -0,0 +1,96 @@ +from typing import Dict + +import numpy as np +from sklearn import base, linear_model, metrics, svm +from sklearn.model_selection import train_test_split + +from hamilton import function_modifiers + + +@function_modifiers.config.when(clf="svm") +def prefit_clf__svm(gamma: float = 0.001) -> base.ClassifierMixin: + """Returns an unfitted SVM classifier object. + + :param gamma: ... + :return: + """ + return svm.SVC(gamma=gamma) + + +@function_modifiers.config.when(clf="logistic") +def prefit_clf__logreg(penalty: str) -> base.ClassifierMixin: + """Returns an unfitted Logistic Regression classifier object. + + :param penalty: + :return: + """ + return linear_model.LogisticRegression(penalty) + + +@function_modifiers.extract_fields( + {"X_train": np.ndarray, "X_test": np.ndarray, "y_train": np.ndarray, "y_test": np.ndarray} +) +def train_test_split_func( + feature_matrix: np.ndarray, + target: np.ndarray, + test_size_fraction: float, + shuffle_train_test_split: bool, +) -> Dict[str, np.ndarray]: + """Function that creates the training & test splits. + + It this then extracted out into constituent components and used downstream. + + :param feature_matrix: + :param target: + :param test_size_fraction: + :param shuffle_train_test_split: + :return: + """ + X_train, X_test, y_train, y_test = train_test_split( + feature_matrix, target, test_size=test_size_fraction, shuffle=shuffle_train_test_split + ) + return {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test} + + +def y_test_with_labels(y_test: np.ndarray, target_names: np.ndarray) -> np.ndarray: + """Adds labels to the target output.""" + return np.array([target_names[idx] for idx in y_test]) + + +def fit_clf( + prefit_clf: base.ClassifierMixin, X_train: np.ndarray, y_train: np.ndarray +) -> base.ClassifierMixin: + """Calls fit on the classifier object; it mutates it.""" + prefit_clf.fit(X_train, y_train) + return prefit_clf + + +def predicted_output(fit_clf: base.ClassifierMixin, X_test: np.ndarray) -> np.ndarray: + """Exercised the fit classifier to perform a prediction.""" + return fit_clf.predict(X_test) + + +def predicted_output_with_labels( + predicted_output: np.ndarray, target_names: np.ndarray +) -> np.ndarray: + """Replaces the predictions with the desired labels.""" + return np.array([target_names[idx] for idx in predicted_output]) + + +def classification_report( + predicted_output_with_labels: np.ndarray, y_test_with_labels: np.ndarray +) -> str: + """Returns a classification report.""" + return metrics.classification_report(y_test_with_labels, predicted_output_with_labels) + + +def confusion_matrix( + predicted_output_with_labels: np.ndarray, y_test_with_labels: np.ndarray +) -> str: + """Returns a confusion matrix report.""" + return metrics.confusion_matrix(y_test_with_labels, predicted_output_with_labels) + + +def model_parameters(fit_clf: base.ClassifierMixin) -> dict: + """Returns a dictionary of model parameters.""" + return fit_clf.get_params() diff --git a/examples/materialization/notebook.ipynb b/examples/materialization/notebook.ipynb new file mode 100644 index 000000000..0a414930a --- /dev/null +++ b/examples/materialization/notebook.ipynb @@ -0,0 +1,535 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 8, + "id": "7bf6a40d", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import os\n", + "\n", + "import data_loaders\n", + "import model_training\n", + "\n", + "from hamilton import base, driver\n", 
+ "from hamilton.io.materialization import to\n", + "import pandas as pd\n", + "\n", + "import custom_materializers" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "7a449245", + "metadata": {}, + "outputs": [], + "source": [ + "dag_config = {\n", + " \"test_size_fraction\": 0.5,\n", + " \"shuffle_train_test_split\": True,\n", + " \"data_loader\" : \"iris\",\n", + " \"clf\" : \"logistic\",\n", + " \"penalty\" : \"l2\"\n", + "}\n", + "dr = (\n", + " driver.Builder()\n", + " .with_adapter(base.DefaultAdapter())\n", + " .with_config(dag_config)\n", + " .with_modules(data_loaders, model_training)\n", + " .build()\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "397b09bc", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "predicted_output_with_labels\n", + "\n", + "predicted_output_with_labels\n", + "\n", + "\n", + "\n", + "predicted_output_with_labels_to_csv\n", + "\n", + "predicted_output_with_labels_to_csv\n", + "\n", + "\n", + "\n", + "predicted_output_with_labels->predicted_output_with_labels_to_csv\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "classification_report\n", + "\n", + "classification_report\n", + "\n", + "\n", + "\n", + "predicted_output_with_labels->classification_report\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "train_test_split_func\n", + "\n", + "train_test_split_func\n", + "\n", + "\n", + "\n", + "y_test\n", + "\n", + "y_test\n", + "\n", + "\n", + "\n", + "train_test_split_func->y_test\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "X_train\n", + "\n", + "X_train\n", + "\n", + "\n", + "\n", + "train_test_split_func->X_train\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "X_test\n", + "\n", + "X_test\n", + "\n", + "\n", + "\n", + "train_test_split_func->X_test\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "y_train\n", + "\n", + "y_train\n", + "\n", + "\n", + "\n", + "train_test_split_func->y_train\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "fit_clf\n", + "\n", + "fit_clf\n", + "\n", + "\n", + "\n", + "predicted_output\n", + "\n", + "predicted_output\n", + "\n", + "\n", + "\n", + "fit_clf->predicted_output\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "clf_to_pickle\n", + "\n", + "clf_to_pickle\n", + "\n", + "\n", + "\n", + "fit_clf->clf_to_pickle\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "model_parameters\n", + "\n", + "model_parameters\n", + "\n", + "\n", + "\n", + "fit_clf->model_parameters\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "data\n", + "\n", + "data\n", + "\n", + "\n", + "\n", + "target_names\n", + "\n", + "target_names\n", + "\n", + "\n", + "\n", + "data->target_names\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "feature_matrix\n", + "\n", + "feature_matrix\n", + "\n", + "\n", + "\n", + "data->feature_matrix\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "target\n", + "\n", + "target\n", + "\n", + "\n", + "\n", + "data->target\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "penalty\n", + "\n", + "Input: penalty\n", + "\n", + "\n", + "\n", + "prefit_clf\n", + "\n", + "prefit_clf\n", + "\n", + "\n", + "\n", + "penalty->prefit_clf\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "predicted_output->predicted_output_with_labels\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "shuffle_train_test_split\n", + "\n", + "Input: shuffle_train_test_split\n", + "\n", + "\n", + "\n", + "shuffle_train_test_split->train_test_split_func\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "target_names->predicted_output_with_labels\n", 
+ "\n", + "\n", + "\n", + "\n", + "\n", + "y_test_with_labels\n", + "\n", + "y_test_with_labels\n", + "\n", + "\n", + "\n", + "target_names->y_test_with_labels\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "y_test->y_test_with_labels\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "X_train->fit_clf\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "model_params_to_json\n", + "\n", + "model_params_to_json\n", + "\n", + "\n", + "\n", + "feature_matrix->train_test_split_func\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "classification_report_to_txt\n", + "\n", + "classification_report_to_txt\n", + "\n", + "\n", + "\n", + "classification_report->classification_report_to_txt\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "prefit_clf->fit_clf\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "X_test->predicted_output\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "y_test_with_labels->classification_report\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "test_size_fraction\n", + "\n", + "Input: test_size_fraction\n", + "\n", + "\n", + "\n", + "test_size_fraction->train_test_split_func\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "model_parameters->model_params_to_json\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "y_train->fit_clf\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "target->train_test_split_func\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "materializers = [\n", + " to.json(\n", + " dependencies=[\"model_parameters\"],\n", + " id=\"model_params_to_json\",\n", + " path=\"./data/params.json\"\n", + " ),\n", + " # classification report to .txt file\n", + " to.file(\n", + " dependencies=[\"classification_report\"],\n", + " id=\"classification_report_to_txt\",\n", + " path=\"./data/classification_report.txt\",\n", + " ),\n", + " # materialize the model to a pickle file\n", + " to.pickle(\n", + " dependencies=[\"fit_clf\"], id=\"clf_to_pickle\", path=\"./data/clf.pkl\"\n", + " ),\n", + " # materialize the predictions we made to a csv file\n", + " to.csv(\n", + " dependencies=[\"predicted_output_with_labels\"],\n", + " id=\"predicted_output_with_labels_to_csv\",\n", + " path=\"./data/predicted_output_with_labels.csv\",\n", + " ),\n", + " ]\n", + "\n", + "dr.visualize_materialization(\n", + " *materializers,\n", + " additional_vars=[\"classification_report\"],\n", + " output_file_path=None,\n", + " render_kwargs={},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f5727b54", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/elijahbenizzy/.pyenv/versions/3.9.10/envs/hamilton/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. 
of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" + ] + } + ], + "source": [ + "materialization_results, additional_vars = dr.materialize(\n", + " # materialize model parameters to json\n", + " *materializers,\n", + " additional_vars=[\"classification_report\"],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8bdfde70", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 94\n", + " 1 0.91 0.93 0.92 85\n", + " 2 0.97 0.99 0.98 96\n", + " 3 0.99 0.97 0.98 93\n", + " 4 0.99 0.92 0.95 88\n", + " 5 0.95 0.95 0.95 85\n", + " 6 0.99 0.97 0.98 97\n", + " 7 0.97 0.97 0.97 89\n", + " 8 0.88 0.88 0.88 82\n", + " 9 0.91 0.97 0.94 90\n", + "\n", + " accuracy 0.96 899\n", + " macro avg 0.95 0.95 0.95 899\n", + "weighted avg 0.96 0.96 0.96 899\n", + "\n" + ] + } + ], + "source": [ + "print(additional_vars['classification_report'])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "a6f5fe83", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 94\n", + " 1 0.91 0.93 0.92 85\n", + " 2 0.97 0.99 0.98 96\n", + " 3 0.99 0.97 0.98 93\n", + " 4 0.99 0.92 0.95 88\n", + " 5 0.95 0.95 0.95 85\n", + " 6 0.99 0.97 0.98 97\n", + " 7 0.97 0.97 0.97 89\n", + " 8 0.88 0.88 0.88 82\n", + " 9 0.91 0.97 0.94 90\n", + "\n", + " accuracy 0.96 899\n", + " macro avg 0.95 0.95 0.95 899\n", + "weighted avg 0.96 0.96 0.96 899\n", + "\n" + ] + } + ], + "source": [ + "print(open((materialization_results['classification_report_to_txt']['path'])).read())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/materialization/requirements.txt b/examples/materialization/requirements.txt new file mode 100644 index 000000000..3f69ad5c2 --- /dev/null +++ b/examples/materialization/requirements.txt @@ -0,0 +1,2 @@ +scikit-learn +sf-hamilton diff --git a/examples/materialization/run.py b/examples/materialization/run.py new file mode 100644 index 000000000..fce1faee7 --- /dev/null +++ b/examples/materialization/run.py @@ -0,0 +1,87 @@ +""" +Example script showing how one might setup a generic model training pipeline that is quickly configurable. 
+""" + +import importlib + +# Required import to register adapters +import os + +import data_loaders +import model_training + +from hamilton import base, driver +from hamilton.io.materialization import to + +# This has to be imported, but the linter doesn't like it cause its unused +# We just need to import it to register the materializers +importlib.import_module("custom_materializers") + + +def get_model_config(model_type: str) -> dict: + """Returns model type specific configuration""" + if model_type == "svm": + return {"clf": "svm", "gamma": 0.001} + elif model_type == "logistic": + return {"clf": "logistic", "penalty": "l2"} + else: + raise ValueError(f"Unsupported model {model_type}.") + + +if __name__ == "__main__": + import sys + + if len(sys.argv) < 3: + print("Error: required arguments are [iris|digits] [svm|logistic]") + sys.exit(1) + _data_set = sys.argv[1] # the data set to load + _model_type = sys.argv[2] # the model type to fit and evaluate with + + dag_config = { + "test_size_fraction": 0.5, + "shuffle_train_test_split": True, + } + if not os.path.exists("data"): + os.mkdir("data") + # augment config + dag_config.update(get_model_config(_model_type)) + dag_config["data_loader"] = _data_set + dr = ( + driver.Builder() + .with_adapter(base.DefaultAdapter()) + .with_config(dag_config) + .with_modules(data_loaders, model_training) + .build() + ) + materializers = [ + to.json( + dependencies=["model_parameters"], id="model_params_to_json", path="./data/params.json" + ), + # classification report to .txt file + to.file( + dependencies=["classification_report"], + id="classification_report_to_txt", + path="./data/classification_report.txt", + ), + # materialize the model to a pickle file + to.pickle(dependencies=["fit_clf"], id="clf_to_pickle", path="./data/clf.pkl"), + # materialize the predictions we made to a csv file + to.csv( + dependencies=["predicted_output_with_labels"], + id="predicted_output_with_labels_to_csv", + path="./data/predicted_output_with_labels.csv", + ), + ] + dr.visualize_materialization( + *materializers, + additional_vars=["classification_report"], + output_file_path="./dag", + render_kwargs={}, + ) + materialization_results, additional_vars = dr.materialize( + # materialize model parameters to json + *materializers, + additional_vars=["classification_report"], + ) + # print(materialization_results["classification_report"]) + # print(additional_vars) From 846b3a61a55e1e8e6bc2d77e1c4a2521631a8eae Mon Sep 17 00:00:00 2001 From: elijahbenizzy Date: Sat, 12 Aug 2023 21:03:57 -0700 Subject: [PATCH 6/8] Sets the default executor to multithreading MUltiprocessing doesn't work in many cases due to the default pickling mechanism being garbage. 
--- hamilton/driver.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hamilton/driver.py b/hamilton/driver.py index 004534624..dab5f7d08 100644 --- a/hamilton/driver.py +++ b/hamilton/driver.py @@ -1179,9 +1179,7 @@ def build(self) -> Driver: execution_manager = self.execution_manager if execution_manager is None: local_executor = self.local_executor or executors.SynchronousLocalTaskExecutor() - remote_executor = self.remote_executor or executors.MultiProcessingExecutor( - max_tasks=10 - ) + remote_executor = self.remote_executor or executors.MultiThreadingExecutor(max_tasks=10) execution_manager = executors.DefaultExecutionManager( local_executor=local_executor, remote_executor=remote_executor ) From eefe8ae749ce7d6aec08ee63fe2102315a0d6fb8 Mon Sep 17 00:00:00 2001 From: elijahbenizzy Date: Sat, 12 Aug 2023 21:55:49 -0700 Subject: [PATCH 7/8] Fixes issue with name clashes on data loader parameters Previously we would inject a node with a parameter name into a parameter consumed by downstream set of nodes. This would cause name-clashes if, say, the parmeter name was `data`: @load_from.json(...) def foo(data: pd.DataFrame) -> ...: ... To fix this, we did two things: 1. Change the data loader nodes that were created to have namespaces so they're unique 2. Allow the NodeInjector to rename input nodes so it can communicate the new names. Note this is probably slightly more abstraction than needed but I have a sense that NodeInjector will be necessary moving forward (external API calls, etc...). --- hamilton/function_modifiers/adapters.py | 9 +++-- hamilton/function_modifiers/base.py | 48 ++++++++++++++++++++--- tests/function_modifiers/test_adapters.py | 14 +++---- 3 files changed, 54 insertions(+), 17 deletions(-) diff --git a/hamilton/function_modifiers/adapters.py b/hamilton/function_modifiers/adapters.py index a5deb9ae3..062c9263e 100644 --- a/hamilton/function_modifiers/adapters.py +++ b/hamilton/function_modifiers/adapters.py @@ -1,6 +1,6 @@ import inspect import typing -from typing import Any, Callable, Collection, Dict, List, Optional, Tuple, Type +from typing import Any, Callable, Collection, Dict, List, Tuple, Type from hamilton import node from hamilton.function_modifiers.base import ( @@ -139,7 +139,7 @@ def _select_param_to_inject(self, params: List[str], fn: Callable) -> str: def inject_nodes( self, params: Dict[str, Type[Type]], config: Dict[str, Any], fn: Callable - ) -> Optional[Collection[node.Node]]: + ) -> Tuple[Collection[node.Node], Dict[str, str]]: pass """Generates two nodes: 1. A node that loads the data from the data source, and returns that + metadata @@ -217,7 +217,7 @@ def get_input_type_key(key: str) -> str: "hamilton.data_loader.classname": f"{loader_cls.__qualname__}", "hamilton.data_loader.node": inject_parameter, }, - namespace=("load_data", fn.__name__), + namespace=(fn.__name__, "load_data"), ) # the filter node is the node that takes the data from the data source, filters out @@ -239,8 +239,9 @@ def filter_function(_inject_parameter=inject_parameter, **kwargs): "hamilton.data_loader.classname": f"{loader_cls.__qualname__}", "hamilton.data_loader.node": inject_parameter, }, + namespace=(fn.__name__, "select_data"), ) - return [loader_node, filter_node] + return [loader_node, filter_node], {inject_parameter: filter_node.name} def _get_inject_parameter_from_function(self, fn: Callable) -> Tuple[str, Type[Type]]: """Gets the name of the parameter to inject the data into. 
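To make the name-clash fix concrete, a hedged illustration: the decorator usage echoes the commit message above, the `path` argument is illustrative, and the generated node names are taken from the updated tests below.

```python
import pandas as pd

from hamilton.function_modifiers import load_from


@load_from.json(path="data.json")  # illustrative loader arguments
def foo(data: pd.DataFrame) -> pd.DataFrame:
    return data

# Before this change, the injected loader node was named after the parameter
# ("data"), clashing with any other node or input of the same name.
# After this change, the two generated nodes are namespaced under the function:
#   foo.load_data.data    -- loads the source and returns (data, metadata)
#   foo.select_data.data  -- strips the metadata and is what `foo` now consumes,
# and foo's "data" input is renamed to point at the namespaced node.
```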
diff --git a/hamilton/function_modifiers/base.py b/hamilton/function_modifiers/base.py index 808999365..debbe37f9 100644 --- a/hamilton/function_modifiers/base.py +++ b/hamilton/function_modifiers/base.py @@ -10,7 +10,7 @@ except ImportError: # python3.10 and above EllipsisType = type(...) -from typing import Any, Callable, Collection, Dict, List, Optional, Type, Union +from typing import Any, Callable, Collection, Dict, List, Optional, Tuple, Type, Union from hamilton import node, registry, settings @@ -228,6 +228,26 @@ def transform_dag( pass +# TODO -- delete this/replace with the version that will be added by +# https://github.com/DAGWorks-Inc/hamilton/pull/249/ as part of the Node class +def _reassign_input_names(node_: node.Node, input_names: Dict[str, Any]) -> node.Node: + """Reassigns the input names of a node. Useful for applying + a node to a separate input if needed. Note that things can get a + little strange if you have multiple inputs with the same name, so + be careful about how you use this. + :param input_names: Input name map to reassign + :return: A node with the input names reassigned + """ + + def new_callable(**kwargs) -> Any: + reverse_input_names = {v: k for k, v in input_names.items()} + return node_.callable(**{reverse_input_names.get(k, k): v for k, v in kwargs.items()}) + + new_input_types = {input_names.get(k, k): v for k, v in node_.input_types.items()} + out = node_.copy_with(callabl=new_callable, input_types=new_input_types) + return out + + class NodeInjector(SubDAGModifier, abc.ABC): """Injects a value as a source node in the DAG. This is a special case of the SubDAGModifier, which gets all the upstream (required) nodes from the subdag and gives the decorator a chance @@ -275,21 +295,37 @@ def transform_dag( :return: """ injectable_params = NodeInjector.find_injectable_params(nodes) - out = list(nodes) - out.extend(self.inject_nodes(injectable_params, config, fn)) + nodes_to_inject, rename_map = self.inject_nodes(injectable_params, config, fn) + out = [] + for node_ in nodes: + # if there's an intersection then we want to rename the input + if set(node_.input_types.keys()) & set(rename_map.keys()): + out.append(_reassign_input_names(node_, rename_map)) + else: + out.append(node_) + out.extend(nodes_to_inject) + if len(set([node_.name for node_ in out])) != len(out): + import pdb + + pdb.set_trace() + print([node_.name for node_ in out]) return out @abc.abstractmethod def inject_nodes( self, params: Dict[str, Type[Type]], config: Dict[str, Any], fn: Callable - ) -> List[node.Node]: + ) -> Tuple[List[node.Node], Dict[str, str]]: """Adds a set of nodes to inject into the DAG. These get injected into the specified param name, - meaning that exactly one of the output nodes will have that name. + meaning that exactly one of the output nodes will have that name. Note that this also allows + input renaming, meaning that the injector can rename the input to something else (to avoid + name-clashes). :param params: Dictionary of all the type names one wants to inject :param config: Configuration with which the DAG was constructed. :param fn: original function we're decorating. This is useful largely for debugging. - :return: A list of nodes to add. Empty if you wish to inject nothing + :return: A list of nodes to add. Empty if you wish to inject nothing, as well as a dictionary, + allowing the injector to rename the inputs (e.g. 
if you want the name to be + namespaced to avoid clashes) """ pass diff --git a/tests/function_modifiers/test_adapters.py b/tests/function_modifiers/test_adapters.py index 49cbda1f6..cba3470f2 100644 --- a/tests/function_modifiers/test_adapters.py +++ b/tests/function_modifiers/test_adapters.py @@ -75,17 +75,17 @@ def fn(data: int) -> int: nodes_by_name = {node_.name: node_ for node_ in nodes} assert len(nodes_by_name) == 3 assert "fn" in nodes_by_name - assert nodes_by_name["data"].tags == { + assert nodes_by_name["fn.load_data.data"].tags == { "hamilton.data_loader.source": "mock", "hamilton.data_loader": True, - "hamilton.data_loader.has_metadata": False, + "hamilton.data_loader.has_metadata": True, "hamilton.data_loader.node": "data", "hamilton.data_loader.classname": MockDataLoader.__qualname__, } - assert nodes_by_name["load_data.fn.data"].tags == { + assert nodes_by_name["fn.select_data.data"].tags == { "hamilton.data_loader.source": "mock", "hamilton.data_loader": True, - "hamilton.data_loader.has_metadata": True, + "hamilton.data_loader.has_metadata": False, "hamilton.data_loader.node": "data", "hamilton.data_loader.classname": MockDataLoader.__qualname__, } @@ -333,7 +333,7 @@ def fn_str_inject(injected_data: str) -> str: ) result = fg.execute(inputs={}, nodes=fg.nodes.values()) assert result["fn_str_inject"] == "foo" - assert result["load_data.fn_str_inject.injected_data"] == ( + assert result["fn_str_inject.load_data.injected_data"] == ( "foo", {"loader": "string_data_loader"}, ) @@ -362,12 +362,12 @@ def fn_str_inject(injected_data_1: str, injected_data_2: int) -> str: ) result = fg.execute(inputs={}, nodes=fg.nodes.values()) assert result["fn_str_inject"] == "foofoo" - assert result["load_data.fn_str_inject.injected_data_1"] == ( + assert result["fn_str_inject.load_data.injected_data_1"] == ( "foo", {"loader": "string_data_loader"}, ) - assert result["load_data.fn_str_inject.injected_data_2"] == ( + assert result["fn_str_inject.load_data.injected_data_2"] == ( 2, {"loader": "int_data_loader_2"}, ) From be724aa249419ec186319710faabf1b538ad19c7 Mon Sep 17 00:00:00 2001 From: elijahbenizzy Date: Mon, 14 Aug 2023 17:08:51 -0700 Subject: [PATCH 8/8] Adds documentation/examples for data adapters This has: 1. A reference table, automatically generated using a custom sphinx directive 2. References for the base classes for extension Re (1) we generate a bare-bones table but it should be enough. For now, we just link to the code, but we will, at some point, link to actual class docs. 
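Before the full extension below, here is a minimal sketch of the Sphinx/docutils pattern it is
built on (a generic illustration, not code from this patch): a Directive subclass whose run()
method returns doctree nodes, registered with the app in setup(). The .rst pages added in this
commit then invoke the real directive as ".. data_adapter_table:: loader" or
".. data_adapter_table:: saver".

    from docutils import nodes
    from docutils.parsers.rst import Directive


    class HelloDirective(Directive):  # hypothetical example directive
        required_arguments = 1

        def run(self):
            # Sphinx renders whatever doctree nodes we return in place of the directive.
            return [nodes.paragraph(text=f"Hello, {self.arguments[0]}!")]


    def setup(app):
        # Makes ".. hello:: world" available to the docs build.
        app.add_directive("hello", HelloDirective)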
---
 docs/conf.py                                  |   1 +
 docs/data_adapters_extension.py               | 288 ++++++++++++++++++
 docs/index.md                                 |   1 +
 docs/reference/io/adapter-documentation.rst   |  20 ++
 docs/reference/io/available-data-adapters.rst |  56 ++++
 docs/reference/io/index.rst                   |  11 +
 examples/materialization/README.md            |   3 +
 hamilton/function_modifiers/adapters.py       |  16 +-
 hamilton/io/data_adapters.py                  |   8 +-
 hamilton/io/materialization.py                |  12 +-
 requirements-docs.txt                         |   1 +
 11 files changed, 402 insertions(+), 15 deletions(-)
 create mode 100644 docs/data_adapters_extension.py
 create mode 100644 docs/reference/io/adapter-documentation.rst
 create mode 100644 docs/reference/io/available-data-adapters.rst
 create mode 100644 docs/reference/io/index.rst

diff --git a/docs/conf.py b/docs/conf.py
index ea274db7d..35f790006 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -30,6 +30,7 @@
     "sphinx.ext.autosummary",
     "myst_parser",
     "sphinx_sitemap",
+    "docs.data_adapters_extension",
 ]

 # for the sitemap extension
---

diff --git a/docs/data_adapters_extension.py b/docs/data_adapters_extension.py
new file mode 100644
index 000000000..923c78cfb
--- /dev/null
+++ b/docs/data_adapters_extension.py
@@ -0,0 +1,288 @@
+import dataclasses
+import inspect
+import os
+from typing import List, Optional, Tuple, Type
+
+import git
+from docutils import nodes
+from docutils.parsers.rst import Directive
+
+import hamilton.io.data_adapters
+from hamilton import registry
+
+"""A module to crawl available data adapters and generate documentation for them.
+Note these currently link out to the source code on GitHub, but they should
+be linking to the documentation instead, which hasn't been generated yet.
+"""
+
+# These have fallbacks for local dev
+GIT_URL = os.environ.get("READTHEDOCS_GIT_CLONE_URL", "https://github.com/dagworks-inc/hamilton")
+GIT_ID = os.environ.get("READTHEDOCS_GIT_IDENTIFIER", "main")
+
+# All the modules that register data adapters
+# When you register a new one, add it here
+MODULES_TO_IMPORT = ["hamilton.io.default_data_loaders", "hamilton.plugins.pandas_extensions"]
+
+for module in MODULES_TO_IMPORT:
+    __import__(module)
+
+
+def get_git_root(path: str) -> str:
+    """Yields the git root of a repo, given an absolute path to
+    a file within the repo.
+
+    :param path: Path to a file within a git repo
+    :return: The root of the git repo
+    """
+    git_repo = git.Repo(path, search_parent_directories=True)
+    git_root = git_repo.git.rev_parse("--show-toplevel")
+    return git_root
+
+
+@dataclasses.dataclass
+class Param:
+    name: str
+    type: str
+    default: Optional[str] = None
+
+
+def get_default(param: dataclasses.Field) -> Optional[str]:
+    """Gets the default of a dataclass field, if it has one.
+
+    :param param: The dataclass field
+    :return: The str representation of the default.
+    """
+    if param.default is dataclasses.MISSING:
+        return None
+    return str(param.default)
+
+
+def get_lines_for_class(class_: Type[Type]) -> Tuple[int, int]:
+    """Gets the set of lines in which a class is implemented
+
+    :param class_: The class to get the lines for
+    :return: A tuple of the start and end lines
+    """
+    lines = inspect.getsourcelines(class_)
+    start_line = lines[1]
+    end_line = lines[1] + len(lines[0])
+    return start_line, end_line
+
+
+def get_class_repr(class_: Type) -> str:
+    """Gets a representation of a class that can be used in documentation.
+ + :param class_: Python class to get the representation for + :return: Str representation + """ + + try: + return class_.__qualname__ + except AttributeError: + # This happens when we have generics or other oddities + return str(class_) + + +@dataclasses.dataclass +class AdapterInfo: + key: str + class_name: str + class_path: str + load_params: List[Param] + save_params: List[Param] + applicable_types: List[str] + file_: str + line_nos: Tuple[int, int] + + @staticmethod + def from_loader(loader: Type[hamilton.io.data_adapters.DataLoader]) -> "AdapterInfo": + """Utility constructor to create the AdapterInfo from a DataLoader class + + :param loader: DataLoader class + :return: AdapterInfo derived from it + """ + + return AdapterInfo( + key=loader.name(), + class_name=loader.__name__, + class_path=loader.__module__, + load_params=[ + Param(name=p.name, type=get_class_repr(p.type), default=get_default(p)) + for p in dataclasses.fields(loader) + ] + if issubclass(loader, hamilton.io.data_adapters.DataSaver) + else None, + save_params=[ + Param(name=p.name, type=get_class_repr(p.type), default=get_default(p)) + for p in dataclasses.fields(loader) + ] + if issubclass(loader, hamilton.io.data_adapters.DataSaver) + else None, + applicable_types=[get_class_repr(t) for t in loader.applicable_types()], + file_=inspect.getfile(loader), + line_nos=get_lines_for_class(loader), + ) + + +def _collect_loaders(saver_or_loader: str) -> List[Type[hamilton.io.data_adapters.AdapterCommon]]: + """Collects all loaders from the registry. + + :return: + """ + out = [] + loaders = ( + list(registry.LOADER_REGISTRY.values()) + if saver_or_loader == "loader" + else list(registry.SAVER_REGISTRY.values()) + ) + for classes in loaders: + for cls in classes: + if cls not in out: + out.append(cls) + return out + + +# Utility functions to render different components of the adapter in table cells + + +def render_key(key: str): + return [nodes.Text(key, key)] + + +def render_class_name(class_name: str): + return [nodes.literal(text=class_name)] + + +def render_class_path(class_path: str, file_: str, line_start: int, line_end: int): + git_path = get_git_root(file_) + file_relative_to_git_root = os.path.relpath(file_, git_path) + href = f"{GIT_URL}/blob/{GIT_ID}/{file_relative_to_git_root}#L{line_start}-L{line_end}" + # href = f"{GIT_URL}/blob/{GIT_ID}/{file_}#L{line_no}" + return [nodes.raw("", f'{class_path}', format="html")] + + +def render_adapter_params(load_params: Optional[List[Param]]): + if load_params is None: + return nodes.raw("", "
    ", format="html") + fieldlist = nodes.field_list() + for i, load_param in enumerate(load_params): + fieldname = nodes.Text(load_param.name) + fieldbody = nodes.literal( + text=load_param.type + + ("=" + load_param.default if load_param.default is not None else "") + ) + field = nodes.field("", fieldname, fieldbody) + fieldlist += field + if i < len(load_params) - 1: + fieldlist += nodes.raw("", "
    ", format="html") + return fieldlist + + +def render_applicable_types(applicable_types: List[str]): + fieldlist = nodes.field_list() + for applicable_type in applicable_types: + fieldlist += nodes.field("", nodes.literal(text=applicable_type), nodes.Text("")) + fieldlist += nodes.raw("", "
    ", format="html") + return fieldlist + + +class DataAdapterTableDirective(Directive): + """Custom directive to render a table of all data adapters. Takes in one argument + that is either 'loader' or 'saver' to indicate which adapters to render.""" + + has_content = True + required_arguments = 1 # Number of required arguments + + def run(self): + """Runs the directive. This does the following: + 1. Collects all loaders from the registry + 2. Creates a table with the following columns: + - Key + - Class name + - Class path + - Load params + - Applicable types + 3. Returns the table + :return: A list of nodes that Sphinx will render, consisting of the table node + """ + saver_or_loader = self.arguments[0] + if saver_or_loader not in ("loader", "saver"): + raise ValueError( + f"loader_or_saver must be one of 'loader' or 'saver', " f"got {saver_or_loader}" + ) + table_data = [ + AdapterInfo.from_loader(loader) for loader in _collect_loaders(saver_or_loader) + ] + + # Create the table and add columns + table_node = nodes.table() + tgroup = nodes.tgroup(cols=6) + table_node += tgroup + + # Create columns + key_spec = nodes.colspec(colwidth=1) + # class_spec = nodes.colspec(colwidth=1) + load_params_spec = nodes.colspec(colwidth=2) + applicable_types_spec = nodes.colspec(colwidth=1) + class_path_spec = nodes.colspec(colwidth=1) + + tgroup += [key_spec, load_params_spec, applicable_types_spec, class_path_spec] + + # Create the table body + thead = nodes.thead() + row = nodes.row() + + # Create entry nodes for each cell + key_entry = nodes.entry() + load_params_entry = nodes.entry() + applicable_types_entry = nodes.entry() + class_path_entry = nodes.entry() + + key_entry += nodes.paragraph(text="key") + + load_params_entry += nodes.paragraph(text=f"{saver_or_loader} params") + applicable_types_entry += nodes.paragraph(text="types") + class_path_entry += nodes.paragraph(text="module") + + row += [key_entry, load_params_entry, applicable_types_entry, class_path_entry] + thead += row + tgroup += thead + tbody = nodes.tbody() + tgroup += tbody + + # Populate table rows based on your table_data + for row_data in table_data: + row = nodes.row() + + # Create entry nodes for each cell + key_entry = nodes.entry() + load_params_entry = nodes.entry() + applicable_types_entry = nodes.entry() + class_path_entry = nodes.entry() + + # Create a paragraph node for each entry + # import pdb + # pdb.set_trace() + # para1 = nodes.literal(text=row_data['column1_data']) + # para2 = nodes.paragraph(text=row_data['column2_data']) + + # Add the paragraph nodes to the entry nodes + key_entry += render_key(row_data.key) + load_params_entry += render_adapter_params(row_data.load_params) + applicable_types_entry += render_applicable_types(row_data.applicable_types) + class_path_entry += render_class_path( + row_data.class_path, row_data.file_, *row_data.line_nos + ) + + # Add the entry nodes to the row + row += [key_entry, load_params_entry, applicable_types_entry, class_path_entry] + + # Add the row to the table body + tbody += row + + return [table_node] + + +def setup(app): + """Required to register the extension""" + app.add_directive("data_adapter_table", DataAdapterTableDirective) diff --git a/docs/index.md b/docs/index.md index b82c522e8..ee17efafe 100644 --- a/docs/index.md +++ b/docs/index.md @@ -41,6 +41,7 @@ contributing reference/decorators/index reference/drivers/index +reference/io/index reference/graph-adapters/index reference/result-builders/index reference/miscellaneous/index diff --git 
a/docs/reference/io/adapter-documentation.rst b/docs/reference/io/adapter-documentation.rst new file mode 100644 index 000000000..6394a4c02 --- /dev/null +++ b/docs/reference/io/adapter-documentation.rst @@ -0,0 +1,20 @@ +========================= +Data Adapters +========================= + +Reference for data adapter base classes: + +.. autoclass:: hamilton.io.data_adapters.DataLoader + :special-members: __init__ + :members: + :inherited-members: + +.. autoclass:: hamilton.io.data_adapters.DataSaver + :special-members: __init__ + :members: + :inherited-members: + +.. autoclass:: hamilton.io.data_adapters.AdapterCommon + :special-members: __init__ + :members: + :inherited-members: diff --git a/docs/reference/io/available-data-adapters.rst b/docs/reference/io/available-data-adapters.rst new file mode 100644 index 000000000..a8c5a2469 --- /dev/null +++ b/docs/reference/io/available-data-adapters.rst @@ -0,0 +1,56 @@ +======================== +Using Data Adapters +======================== + +This is an index of all the available data adapters, both savers and loaders. +Note that some savers and loaders are the same (certain classes can handle both), +but some are different. You will want to reference this when calling out to any of the following: + +1. Using :doc:`/reference/decorators/save_to/`. +2. Using :doc:`/reference/decorators/load_from/`. +3. Using :doc:`materialize ` + +To read these tables, you want to first look at the key to determine which format you want -- +these should be human-readable and familiar to you. Then you'll want to look at the `types` field +to figure out which is the best for your case (the object you want to load from or save to). + +Finally, look up the adapter params to see what parameters you can pass to the data adapters. +The optional params come with their default value specified. + +If you want more information, click on the `module`, it will send you to the code that implements +it to see how the parameters are used. + +As an example, say we wanted to save a pandas dataframe to a CSV file. We would first find the +key `csv`, which would inform us that we want to call `save_to.csv` (or `to.csv` in the case +of `materialize`). Then, we would look at the `types` field, finding that there is a pandas +dataframe adapter. Finally, we would look at the `params` field, finding that we can pass +`path`, and (optionally) `sep` (which we'd realize defaults to `,` when looking at the code). + +All together, we'd end up with: + +.. code-block:: python + + import pandas as pd + from hamilton.function_modifiers import value, save_to + + @save_to.csv(path=value("my_file.csv")) + def my_data(...) -> pd.DataFrame: + ... + +And we're good to go! + +If you want to extend these, see :doc:`/reference/io/available-data-adapters` for documentation, +and `the example `_ +in the repository for an example of how to do so. + +============= +Data Loaders +============= + +.. data_adapter_table:: loader + +============= +Data Savers +============= + +.. data_adapter_table:: saver diff --git a/docs/reference/io/index.rst b/docs/reference/io/index.rst new file mode 100644 index 000000000..637a69c03 --- /dev/null +++ b/docs/reference/io/index.rst @@ -0,0 +1,11 @@ +============== +I/O +============== + +This section contains any information about I/O within Hamilton + +.. 
toctree:: + :maxdepth: 2 + + available-data-adapters + adapter-documentation diff --git a/examples/materialization/README.md b/examples/materialization/README.md index 24cf6fe2a..3a6c73e8d 100644 --- a/examples/materialization/README.md +++ b/examples/materialization/README.md @@ -23,6 +23,9 @@ locations through a driver call. We demonstrate: See [run.py](run.py) for the full example. +In this example we only pass literal values to the materializers. That said, you can use both `source` (to specify the source from an upstream node), +and `value` (which is the default) to specify literals. + ## `driver.materialize` diff --git a/hamilton/function_modifiers/adapters.py b/hamilton/function_modifiers/adapters.py index 062c9263e..333561516 100644 --- a/hamilton/function_modifiers/adapters.py +++ b/hamilton/function_modifiers/adapters.py @@ -312,7 +312,9 @@ def __getattr__(cls, item: str): f"Available loaders are: {LOADER_REGISTRY.keys()}. " f"If you've gotten to this point, you either (1) spelled the " f"loader name wrong, (2) are trying to use a loader that does" - f"not exist (yet)" + f"not exist (yet). For a list of available loaders, see: " + f"https://hamilton.readthedocs.io/reference/io/available-data-adapters/#data" + f"-loaders " ) from e @@ -425,11 +427,13 @@ def __getattr__(cls, item: str): return super().__getattribute__(item) except AttributeError as e: raise AttributeError( - f"No saver named: {item} available for {cls.__name__}. " - f"Available data savers are: {list(SAVER_REGISTRY.keys())}. " - f"If you've gotten to this point, you either (1) spelled the " - f"loader name wrong, (2) are trying to use a saver that does" - f"not exist (yet)." + "No saver named: {item} available for {cls.__name__}. " + "Available data savers are: {list(SAVER_REGISTRY.keys())}. " + "If you've gotten to this point, you either (1) spelled the " + "loader name wrong, (2) are trying to use a saver that does" + "not exist (yet). For a list of available savers, see " + "https://hamilton.readthedocs.io/reference/io/available-data-adapters/#data" + "-loaders " ) from e diff --git a/hamilton/io/data_adapters.py b/hamilton/io/data_adapters.py index 965a1f20f..b997227e6 100644 --- a/hamilton/io/data_adapters.py +++ b/hamilton/io/data_adapters.py @@ -153,12 +153,12 @@ class DataSaver(AdapterCommon, abc.ABC): @abc.abstractmethod def save_data(self, data: Any) -> Dict[str, Any]: """Saves the data to the data source. - Note this uses the constructor parameters to determine - how to save the data. + Note this uses the constructor parameters to determine + how to save the data. :return: Any relevant metadata. This is up the the data saver, but will likely - include the URI, etc... This is going to be similar to the metadata returned - by the data loader in the loading tuple. + include the URI, etc... This is going to be similar to the metadata returned + by the data loader in the loading tuple. """ pass diff --git a/hamilton/io/materialization.py b/hamilton/io/materialization.py index 0bf6fc742..09952cb82 100644 --- a/hamilton/io/materialization.py +++ b/hamilton/io/materialization.py @@ -28,11 +28,13 @@ def __getattr__(cls, item: str): return super().__getattribute__(item) except AttributeError as e: raise AttributeError( - f"No data materializer named: {item}. " - f"Available materializers are: {SAVER_REGISTRY.keys()}. " - f"If you've gotten to this point, you either (1) spelled the " - f"loader name wrong, (2) are trying to use a loader that does" - f"not exist (yet)" + "No data materializer named: {item}. 
" + "Available materializers are: {SAVER_REGISTRY.keys()}. " + "If you've gotten to this point, you either (1) spelled the " + "loader name wrong, (2) are trying to use a loader that does" + "not exist (yet). For a list of available materializers, see " + "https://hamilton.readthedocs.io/reference/io/available-data-adapters/#data" + "-loaders " ) from e diff --git a/requirements-docs.txt b/requirements-docs.txt index 36137690b..2403cff5d 100644 --- a/requirements-docs.txt +++ b/requirements-docs.txt @@ -3,6 +3,7 @@ alabaster>=0.7,<0.8,!=0.7.5 # read the docs pins commonmark==0.9.1 # read the docs pins dask[distributed] furo +gitpython # Required for parsing git info for generation of data-adapter docs mock==1.0.1 # read the docs pins myst-parser==0.18.1 # latest version of myst at this time pillow