diff --git a/examples/ibisml/column_dataflow.py b/examples/ibisml/column_dataflow.py deleted file mode 100644 index 92ca41c2c..000000000 --- a/examples/ibisml/column_dataflow.py +++ /dev/null @@ -1,61 +0,0 @@ -from typing import Optional - -import ibis -import ibis.expr.types as ir - -from hamilton.function_modifiers import extract_columns -from hamilton.plugins import ibis_extensions # noqa: F401 - - -# extract specific columns from the table -@extract_columns("son", "pet", "month_of_absence") -def raw_table(raw_data_path: str) -> ir.Table: - """Load the CSV found at `raw_data_path` into a Table expression - and format columns to snakecase - """ - return ibis.read_csv(sources=raw_data_path, table_name="absenteism").rename("snake_case") - - -# accesses a single column from `raw_table` -def has_children(son: ir.Column) -> ir.BooleanColumn: - """True if someone has any children""" - return ibis.ifelse(son > 0, True, False) - - -# narrows the return type from `ir.Column` to `ir.BooleanColumn` -def has_pet(pet: ir.Column) -> ir.BooleanColumn: - """True if someone has any pets""" - return ibis.ifelse(pet > 0, True, False).cast(bool) - - -# typing and docstring provides business context to features -def is_summer_brazil(month_of_absence: ir.Column) -> ir.BooleanColumn: - """True if it is summer in Brazil during this month - - People in the northern hemisphere are likely to take vacations - to warm places when it's cold locally - """ - return month_of_absence.isin([1, 2, 12]) - - -def feature_table( - raw_table: ir.Table, - has_children: ir.BooleanColumn, - has_pet: ir.BooleanColumn, - is_summer_brazil: ir.BooleanColumn, -) -> ir.Table: - """Join computed features to the `raw_data` table""" - return raw_table.mutate( - has_children=has_children, - has_pet=has_pet, - is_summer_brazil=is_summer_brazil, - ) - - -def feature_set( - feature_table: ir.Table, - feature_selection: list[str], - condition: Optional[ibis.common.deferred.Deferred] = None, -) -> ir.Table: - """Select feature columns and filter rows""" - return feature_table[feature_selection].filter(condition) diff --git a/examples/ibisml/cross_validation.png b/examples/ibisml/cross_validation.png index 71d11f8e2..d44453340 100644 Binary files a/examples/ibisml/cross_validation.png and b/examples/ibisml/cross_validation.png differ diff --git a/examples/ibisml/ibis_feature_set.png b/examples/ibisml/ibis_feature_set.png deleted file mode 100644 index c93e16ee5..000000000 Binary files a/examples/ibisml/ibis_feature_set.png and /dev/null differ diff --git a/examples/ibisml/model_training.py b/examples/ibisml/model_training.py index 73fbc5955..2beb019f3 100644 --- a/examples/ibisml/model_training.py +++ b/examples/ibisml/model_training.py @@ -79,11 +79,11 @@ def prepare_data( train = transform(train_set) df_train = train.to_pandas() X_train = df_train[train.features] - y_train = df_train[train.outcomes] + y_train = df_train[train.outcomes].to_numpy().reshape(-1) df_test = transform(val_set).to_pandas() X_val = df_test[train.features] - y_val = df_test[train.outcomes] + y_val = df_test[train.outcomes].to_numpy().reshape(-1) return dict( X_train=X_train, @@ -161,7 +161,7 @@ def train_full_model( data = transform(feature_set) df = data.to_pandas() X = df[data.features] - y = df[data.outcomes] + y = df[data.outcomes].to_numpy().reshape(-1) base_model.fit(X, y) return dict( diff --git a/examples/ibisml/run.py b/examples/ibisml/run.py index ae013ecfc..6822c6775 100644 --- a/examples/ibisml/run.py +++ b/examples/ibisml/run.py @@ -1,6 +1,6 @@ from hamilton import driver -from hamilton.plugins.h_tqdm import ProgressBar from hamilton.execution.executors import SynchronousLocalTaskExecutor +from hamilton.plugins.h_tqdm import ProgressBar def view_expression(expression, **kwargs): @@ -16,33 +16,17 @@ def view_expression(expression, **kwargs): return dot -def main(level: str, model: str): - dataflow_components = [] - config = {} - final_vars = ["feature_set"] - - if level == "column": - import column_dataflow - - dataflow_components.append(column_dataflow) - elif level == "table": - import table_dataflow - - dataflow_components.append(table_dataflow) - else: - raise ValueError("`level` must be in ['column', 'table']") - - if model: - import model_training +def main(model: str): + import model_training + import table_dataflow - dataflow_components.append(model_training) - config["model"] = model - final_vars.extend(["full_model", "fitted_recipe", "cross_validation_scores"]) + config = {"model": model} + final_vars = ["full_model", "fitted_recipe", "cross_validation_scores"] # build the Driver from modules dr = ( driver.Builder() - .with_modules(*dataflow_components) + .with_modules(table_dataflow, model_training) .with_config(config) .with_adapters(ProgressBar()) .enable_dynamic_execution(allow_experimental_mode=True) @@ -68,7 +52,6 @@ def main(level: str, model: str): ) res = dr.execute(final_vars, inputs=inputs) - view_expression(res["feature_set"], filename="ibis_feature_set", format="png") print("Dataflow result keys: ", list(res.keys())) @@ -77,8 +60,7 @@ def main(level: str, model: str): import argparse parser = argparse.ArgumentParser() - parser.add_argument("--level", choices=["column", "table"]) parser.add_argument("--model", choices=["linear", "random_forest", "boosting"]) args = parser.parse_args() - main(level=args.level, model=args.model) + main(model=args.model)