diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 3a4e2b6c69..082eb3face 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -54,10 +54,15 @@ jobs: pip install -e . python -c "import flaml" pip install -e .[test] - - name: On Ubuntu python 3.8, install pyspark 3.2.3 - if: matrix.python-version == '3.8' && matrix.os == 'ubuntu-latest' + - name: On Ubuntu python 3.10, install pyspark 3.4.1 + if: matrix.python-version == '3.10' && matrix.os == 'ubuntu-latest' run: | - pip install pyspark==3.2.3 + pip install pyspark==3.4.1 + pip list | grep "pyspark" + - name: On Ubuntu python 3.11, install pyspark 3.5.1 + if: matrix.python-version == '3.11' && matrix.os == 'ubuntu-latest' + run: | + pip install pyspark==3.5.1 pip list | grep "pyspark" - name: If linux and python<3.11, install ray 2 if: matrix.os == 'ubuntu-latest' && matrix.python-version != '3.11' @@ -77,11 +82,6 @@ jobs: if: matrix.python-version == '3.8' || matrix.python-version == '3.9' run: | pip install -e .[vw] - - name: Uninstall pyspark on (python 3.9) or windows - if: matrix.python-version == '3.9' || matrix.os == 'windows-2019' - run: | - # Uninstall pyspark to test env without pyspark - pip uninstall -y pyspark - name: Test with pytest if: matrix.python-version != '3.10' run: | diff --git a/.gitignore b/.gitignore index 9dc1eea63c..8a3365b203 100644 --- a/.gitignore +++ b/.gitignore @@ -163,6 +163,24 @@ output/ flaml/tune/spark/mylearner.py *.pkl +data/ +benchmark/pmlb/csv_datasets +benchmark/*.csv + +checkpoints/ +test/default +test/housing.json +test/nlp/default/transformer_ms/seq-classification.json + +flaml/fabric/fanova/_fanova.c # local config files *.config.local + +local_debug/ patch.diff + +# Test things +notebook/lightning_logs/ +lightning_logs/ +flaml/autogen/extensions/tmp/ +test/autogen/my_tmp/ diff --git a/flaml/__init__.py b/flaml/__init__.py index ab323377fb..8664127e3a 100644 --- a/flaml/__init__.py +++ b/flaml/__init__.py @@ -1,6 +1,11 @@ import logging -from flaml.automl import AutoML, logger_formatter +try: + from flaml.automl import AutoML, logger_formatter + + has_automl = True +except ImportError: + has_automl = False from flaml.onlineml.autovw import AutoVW from flaml.tune.searcher import CFO, FLOW2, BlendSearch, BlendSearchTuner, RandomSearch from flaml.version import __version__ @@ -8,3 +13,6 @@ # Set the root logger. logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) + +if not has_automl: + logger.warning("flaml.automl is not available. 
Please install flaml[automl] to enable AutoML functionalities.") diff --git a/flaml/automl/automl.py b/flaml/automl/automl.py index 77023987ee..115f9748d0 100644 --- a/flaml/automl/automl.py +++ b/flaml/automl/automl.py @@ -7,6 +7,7 @@ import json import logging import os +import random import sys import time from functools import partial @@ -16,7 +17,7 @@ from flaml import tune from flaml.automl.logger import logger, logger_formatter -from flaml.automl.ml import train_estimator +from flaml.automl.ml import huggingface_metric_to_mode, sklearn_metric_name_set, spark_metric_name_dict, train_estimator from flaml.automl.spark import DataFrame, Series, psDataFrame, psSeries from flaml.automl.state import AutoMLState, SearchState from flaml.automl.task.factory import task_factory @@ -45,6 +46,7 @@ try: from sklearn.base import BaseEstimator + from sklearn.pipeline import Pipeline except ImportError: BaseEstimator = object ERROR = ERROR or ImportError("please install flaml[automl] option to use the flaml.automl package.") @@ -54,6 +56,14 @@ except ImportError: mlflow = None +try: + from flaml.fabric.mlflow import MLflowIntegration, get_mlflow_log_latency, infer_signature, is_autolog_enabled + + internal_mlflow = True +except ImportError: + internal_mlflow = False + + try: from ray import __version__ as ray_version @@ -171,7 +181,7 @@ def custom_metric( 'better' only logs configs with better loss than previos iters 'all' logs all the tried configs. model_history: A boolean of whether to keep the best - model per estimator. Make sure memory is large enough if setting to True. + model per estimator. Make sure memory is large enough if setting to True. Default False. log_training_metric: A boolean of whether to log the training metric for each model. mem_thres: A float of the memory size constraint in bytes. @@ -247,7 +257,10 @@ def custom_metric( search is considered to converge. force_cancel: boolean, default=False | Whether to forcely cancel Spark jobs if the search time exceeded the time budget. - append_log: boolean, default=False | Whether to directly append the log + mlflow_exp_name: str, default=None | The name of the mlflow experiment. This should be specified when + mlflow autologging is enabled on Spark. Otherwise it will log all the results into the experiment with the + same name as the basename of the main entry file. + append_log: boolean, default=False | Whether to directly append the log records to the input log file if it exists. auto_augment: boolean, default=True | Whether to automatically augment rare classes. @@ -320,9 +333,7 @@ def custom_metric( } } ``` - mlflow_logging: boolean, default=True | Whether to log the training results to mlflow. - This requires mlflow to be installed and to have an active mlflow run. - FLAML will create nested runs. + mlflow_logging: boolean, default=True | Whether to log the training results to mlflow. Has no effect if mlflow is not installed. 
""" if ERROR: @@ -331,6 +342,8 @@ def custom_metric( self._state = AutoMLState() self._state.learner_classes = {} self._settings = settings + self._automl_user_configurations = settings.copy() + self._settings.pop("automl_user_configurations", None) # no budget by default settings["time_budget"] = settings.get("time_budget", -1) settings["task"] = settings.get("task", "classification") @@ -362,6 +375,7 @@ def custom_metric( settings["preserve_checkpoint"] = settings.get("preserve_checkpoint", True) settings["early_stop"] = settings.get("early_stop", False) settings["force_cancel"] = settings.get("force_cancel", False) + settings["mlflow_exp_name"] = settings.get("mlflow_exp_name", None) settings["append_log"] = settings.get("append_log", False) settings["min_sample_size"] = settings.get("min_sample_size", MIN_SAMPLE_TRAIN) settings["use_ray"] = settings.get("use_ray", False) @@ -377,6 +391,7 @@ def custom_metric( settings["mlflow_logging"] = settings.get("mlflow_logging", True) self._estimator_type = "classifier" if settings["task"] in CLASSIFICATION else "regressor" + self.best_run_id = None def get_params(self, deep: bool = False) -> dict: return self._settings.copy() @@ -475,14 +490,29 @@ def save_best_config(self, filename): with open(filename, "w") as f: json.dump(best, f) + @property + def supported_metrics(self): + """ + Returns a tuple of supported metrics for the task. + + Returns: + metrics (Tuple): sklearn metrics from the sklearn package; + huggingface metrics from the datasets package; + spark metrics from the pyspark package. + + """ + + return sklearn_metric_name_set, huggingface_metric_to_mode.keys(), spark_metric_name_dict + @property def feature_transformer(self): - """Returns feature transformer which is used to preprocess data before applying training or inference.""" - return getattr(self, "_transformer", None) + """Returns the AutoML feature transformer.""" + data_processor = getattr(self, "_transformer", None) + return data_processor @property def label_transformer(self): - """Returns label transformer which is used to preprocess labels before scoring, and inverse transform labels after inference.""" + """Returns the AutoML label transformer.""" return getattr(self, "_label_transformer", None) @property @@ -779,7 +809,7 @@ def retrain_from_log( max_epochs: int, default = 20 | Maximum number of epochs to run training, only used by TemporalFusionTransformerEstimator. batch_size: int, default = 64 | Batch size for training model, only - used by TemporalFusionTransformerEstimator. + used by TemporalFusionTransformerEstimator and TCNEstimator. """ task = task or self._settings.get("task") if isinstance(task, str): @@ -1201,6 +1231,7 @@ def fit( skip_transform=None, mlflow_logging=None, fit_kwargs_by_estimator=None, + mlflow_exp_name=None, **fit_kwargs, ): """Find a model for a given task. @@ -1294,7 +1325,7 @@ def custom_metric( 'all' logs all the tried configs. model_history: A boolean of whether to keep the trained best model per estimator. Make sure memory is large enough if setting to True. - Default value is False: best_model_for_estimator would return a + Default value is False. If False, best_model_for_estimator would return an untrained model for non-best learner. log_training_metric: A boolean of whether to log the training metric for each model. @@ -1380,7 +1411,10 @@ def custom_metric( early_stop: boolean, default=False | Whether to stop early if the search is considered to converge. force_cancel: boolean, default=False | Whether to forcely cancel the PySpark job if overtime. 
- append_log: boolean, default=False | Whether to directly append the log + mlflow_exp_name: str, default=None | The name of the mlflow experiment. This should be specified when + mlflow autologging is enabled on Spark. Otherwise it will log all the results into the experiment with the + same name as the basename of the main entry file. + append_log: boolean, default=False | Whether to directly append the log records to the input log file if it exists. auto_augment: boolean, default=True | Whether to automatically augment rare classes. @@ -1465,9 +1499,7 @@ def cv_score_agg_func(val_loss_folds, log_metrics_folds): skip_transform: boolean, default=False | Whether to pre-process data prior to modeling. mlflow_logging: boolean, default=None | Whether to log the training results to mlflow. Default value is None, which means the logging decision is made based on - AutoML.__init__'s mlflow_logging argument. - This requires mlflow to be installed and to have an active mlflow run. - FLAML will create nested runs. + AutoML.__init__'s mlflow_logging argument. Has no effect if mlflow is not installed. fit_kwargs_by_estimator: dict, default=None | The user specified keywords arguments, grouped by estimator name. For TransformersEstimator, available fit_kwargs can be found from [TrainingArgumentsForAuto](nlp/huggingface/training_args). @@ -1517,7 +1549,7 @@ def cv_score_agg_func(val_loss_folds, log_metrics_folds): max_epochs: int, default = 20 | Maximum number of epochs to run training, only used by TemporalFusionTransformerEstimator. batch_size: int, default = 64 | Batch size for training model, only - used by TemporalFusionTransformerEstimator. + used by TemporalFusionTransformerEstimator and TCNEstimator. """ self._state._start_time_flag = self._start_time_flag = time.time() @@ -1568,6 +1600,7 @@ def cv_score_agg_func(val_loss_folds, log_metrics_folds): ) early_stop = self._settings.get("early_stop") if early_stop is None else early_stop force_cancel = self._settings.get("force_cancel") if force_cancel is None else force_cancel + mlflow_exp_name = self._settings.get("mlflow_exp_name") if mlflow_exp_name is None else mlflow_exp_name # no search budget is provided? 
no_budget = time_budget < 0 and max_iter is None and not early_stop append_log = self._settings.get("append_log") if append_log is None else append_log @@ -1620,7 +1653,6 @@ def cv_score_agg_func(val_loss_folds, log_metrics_folds): self._use_ray = use_ray # use the following condition if we have an estimation of average_trial_time and average_trial_overhead # self._use_ray = use_ray or n_concurrent_trials > ( average_trial_time + average_trial_overhead) / (average_trial_time) - if self._use_ray is not False: import ray @@ -1654,11 +1686,29 @@ def cv_score_agg_func(val_loss_folds, log_metrics_folds): self._state.fit_kwargs = fit_kwargs custom_hp = custom_hp or self._settings.get("custom_hp") self._skip_transform = self._settings.get("skip_transform") if skip_transform is None else skip_transform - self._mlflow_logging = self._settings.get("mlflow_logging") if mlflow_logging is None else mlflow_logging + self._mlflow_logging = ( + False + if mlflow is None + else self._settings.get("mlflow_logging") + if mlflow_logging is None + else mlflow_logging + ) fit_kwargs_by_estimator = fit_kwargs_by_estimator or self._settings.get("fit_kwargs_by_estimator") self._state.fit_kwargs_by_estimator = fit_kwargs_by_estimator.copy() # shallow copy of fit_kwargs_by_estimator self._state.weight_val = sample_weight_val - + self._mlflow_exp_name = mlflow_exp_name + self.mlflow_integration = None + self.autolog_extra_tag = { + "extra_tag.sid": f"flaml_{flaml_version}_{int(time.time())}_{random.randint(1001, 9999)}" + } + if internal_mlflow and self._mlflow_logging and (mlflow.active_run() or is_autolog_enabled()): + try: + self.mlflow_integration = MLflowIntegration("automl", mlflow_exp_name, extra_tag=self.autolog_extra_tag) + self._mlflow_exp_name = self.mlflow_integration.experiment_name + if not (mlflow.active_run() is not None or is_autolog_enabled()): + self.mlflow_integration.only_history = True + except KeyError: + print("Not in Fabric, Skipped") task.validate_data( self, self._state, @@ -1723,6 +1773,11 @@ def cv_score_agg_func(val_loss_folds, log_metrics_folds): self._min_sample_size_input = min_sample_size self._prepare_data(eval_method, split_ratio, n_splits) + # infer the signature of the input/output data + if self.mlflow_integration is not None: + self.estimator_signature = infer_signature(self._state.X_train, self._state.y_train) + self.pipeline_signature = infer_signature(X_train, y_train, dataframe, label) + # TODO pull this to task as decide_sample_size if isinstance(self._min_sample_size, dict): self._sample = { @@ -1821,6 +1876,11 @@ def is_to_reverse_metric(metric, task): and (max_iter > 0 or retrain_full is True) or max_iter == 1 ) + if self.mlflow_integration is not None and all( + [self.mlflow_integration.parent_run_id is None, not self.mlflow_integration.only_history] + ): + # force not retrain if no active run + self._state.retrain_final = False # add custom learner for estimator_name in estimator_list: if estimator_name not in self._state.learner_classes: @@ -1953,6 +2013,8 @@ def is_to_reverse_metric(metric, task): ) # NOTE: this is after kwargs is updated to fit_kwargs_by_estimator del self._state.groups, self._state.groups_all, self._state.groups_val logger.setLevel(old_level) + if self.mlflow_integration is not None: + self.mlflow_integration.resume_mlflow() def _search_parallel(self): if self._use_ray is not False: @@ -2049,6 +2111,14 @@ def _search_parallel(self): if self._use_spark: # use spark as parallel backend + mlflow_log_latency = ( + 
get_mlflow_log_latency(model_history=self._state.model_history) if self.mlflow_integration else 0 + ) + ( + logger.info(f"Estimated mlflow_log_latency: {mlflow_log_latency} seconds.") + if mlflow_log_latency > 0 + else None + ) analysis = tune.run( self.trainable, search_alg=search_alg, @@ -2061,6 +2131,9 @@ def _search_parallel(self): use_ray=False, use_spark=True, force_cancel=self._force_cancel, + mlflow_exp_name=self._mlflow_exp_name, + automl_info=(mlflow_log_latency,), # pass automl info to tune.run + extra_tag=self.autolog_extra_tag, # raise_on_failed_trial=False, # keep_checkpoints_num=1, # checkpoint_score_attr="min-val_loss", @@ -2121,6 +2194,8 @@ def _search_parallel(self): self._search_states[estimator].best_config = config if better or self._log_type == "all": self._log_trial(search_state, estimator) + if self.mlflow_integration: + self.mlflow_integration.record_state(self, search_state, estimator) def _log_trial(self, search_state, estimator): if self._training_log: @@ -2134,36 +2209,6 @@ def _log_trial(self, search_state, estimator): estimator, search_state.sample_size, ) - if self._mlflow_logging and mlflow is not None and mlflow.active_run(): - with mlflow.start_run(nested=True): - mlflow.log_metric("iter_counter", self._track_iter) - if (search_state.metric_for_logging is not None) and ( - "intermediate_results" in search_state.metric_for_logging - ): - for each_entry in search_state.metric_for_logging["intermediate_results"]: - with mlflow.start_run(nested=True): - mlflow.log_metrics(each_entry) - mlflow.log_metric("iter_counter", self._iter_per_learner[estimator]) - del search_state.metric_for_logging["intermediate_results"] - if search_state.metric_for_logging: - mlflow.log_metrics(search_state.metric_for_logging) - mlflow.log_metric("trial_time", search_state.trial_time) - mlflow.log_metric("wall_clock_time", self._state.time_from_start) - mlflow.log_metric("validation_loss", search_state.val_loss) - mlflow.log_params(search_state.config) - mlflow.log_param("learner", estimator) - mlflow.log_param("sample_size", search_state.sample_size) - mlflow.log_metric("best_validation_loss", search_state.best_loss) - mlflow.log_param("best_config", search_state.best_config) - mlflow.log_param("best_learner", self._best_estimator) - mlflow.log_metric( - self._state.metric if isinstance(self._state.metric, str) else self._state.error_metric, - 1 - search_state.val_loss - if self._state.error_metric.startswith("1-") - else -search_state.val_loss - if self._state.error_metric.startswith("-") - else search_state.val_loss, - ) def _search_sequential(self): try: @@ -2317,9 +2362,18 @@ def _search_sequential(self): verbose=max(self.verbose - 3, 0), use_ray=False, use_spark=False, + force_cancel=self._force_cancel, + mlflow_exp_name=self._mlflow_exp_name, + automl_info=(0,), # pass automl info to tune.run + extra_tag=self.autolog_extra_tag, ) time_used = time.time() - start_run_time better = False + ( + logger.debug(f"result in automl: {analysis.trials}, {analysis.trials[-1].last_result}") + if analysis.trials + else logger.debug("result in automl: [], None") + ) if analysis.trials and analysis.trials[-1].last_result: result = analysis.trials[-1].last_result search_state.update(result, time_used=time_used) @@ -2382,6 +2436,8 @@ def _search_sequential(self): search_state.trained_estimator.cleanup() if better or self._log_type == "all": self._log_trial(search_state, estimator) + if self.mlflow_integration: + self.mlflow_integration.record_state(self, search_state, estimator) logger.info( " 
at {:.1f}s,\testimator {}'s best error={:.4f},\tbest estimator {}'s best error={:.4f}".format( @@ -2482,6 +2538,12 @@ def _search(self): self._training_log.checkpoint() self._state.time_from_start = time.time() - self._start_time_flag if self._best_estimator: + if self.mlflow_integration: + self.mlflow_integration.log_automl(self) + if mlflow.active_run() is None: + if self.mlflow_integration.parent_run_id is not None and self.mlflow_integration.autolog: + # ensure result of retrain autolog to parent run + mlflow.start_run(run_id=self.mlflow_integration.parent_run_id) self._selected = self._search_states[self._best_estimator] self.modelcount = sum(search_state.total_iter for search_state in self._search_states.values()) if self._trained_estimator: @@ -2618,11 +2680,34 @@ def _search(self): self._best_estimator, state.best_config, self.data_size_full, + is_retrain=True, ) logger.info(f"retrain {self._best_estimator} for {retrain_time:.1f}s") state.best_config_train_time = retrain_time if self._trained_estimator: logger.info(f"retrained model: {self._trained_estimator.model}") + if self.best_run_id is not None: + logger.info(f"Best MLflow run name: {self.best_run_name}") + logger.info(f"Best MLflow run id: {self.best_run_id}") + if self.mlflow_integration is not None: + # try log retrained model + if all( + [ + self.mlflow_integration.manual_log, + not self.mlflow_integration.has_model, + self.mlflow_integration.parent_run_id is not None, + ] + ): + if mlflow.active_run() is None: + mlflow.start_run(run_id=self.mlflow_integration.parent_run_id) + self.mlflow_integration.log_model( + self._trained_estimator.model, + self.best_estimator, + signature=self.estimator_signature, + ) + self.mlflow_integration.pickle_and_log_automl_artifacts( + self, self.model, self.best_estimator, signature=self.pipeline_signature + ) else: logger.info("not retraining because the time budget is too small.") @@ -2696,3 +2781,7 @@ def _select_estimator(self, estimator_list): q += inv[i] / s if p < q: return estimator_list[i] + + @property + def automl_pipeline(self): + return None diff --git a/flaml/automl/ml.py b/flaml/automl/ml.py index 4f39a09889..bd13d8259e 100644 --- a/flaml/automl/ml.py +++ b/flaml/automl/ml.py @@ -13,6 +13,7 @@ from flaml.automl.spark import ERROR as SPARK_ERROR from flaml.automl.spark import DataFrame, Series, psDataFrame, psSeries from flaml.automl.task.task import Task +from flaml.automl.time_series import TimeSeriesDataset try: from sklearn.metrics import ( @@ -33,7 +34,6 @@ if SPARK_ERROR is None: from flaml.automl.spark.metrics import spark_metric_loss_score -from flaml.automl.time_series import TimeSeriesDataset logger = logging.getLogger(__name__) @@ -89,6 +89,11 @@ "wer": "min", } huggingface_submetric_to_metric = {"rouge1": "rouge", "rouge2": "rouge"} +spark_metric_name_dict = { + "Regression": ["r2", "rmse", "mse", "mae", "var"], + "Binary Classification": ["pr_auc", "roc_auc"], + "Multi-class Classification": ["accuracy", "log_loss", "f1", "micro_f1", "macro_f1"], +} def metric_loss_score( @@ -122,7 +127,7 @@ def metric_loss_score( import datasets datasets_metric_name = huggingface_submetric_to_metric.get(metric_name, metric_name.split(":")[0]) - metric = datasets.load_metric(datasets_metric_name) + metric = datasets.load_metric(datasets_metric_name, trust_remote_code=True) metric_mode = huggingface_metric_to_mode[datasets_metric_name] if metric_name.startswith("seqeval"): @@ -334,6 +339,14 @@ def compute_estimator( if fit_kwargs is None: fit_kwargs = {} + fe_params = {} + for 
param, value in config_dic.items(): + if param.startswith("fe."): + fe_params[param] = value + + for param, value in fe_params.items(): + config_dic.pop(param) + estimator_class = estimator_class or task.estimator_class_from_str(estimator_name) estimator = estimator_class( **config_dic, @@ -401,12 +414,21 @@ def train_estimator( free_mem_ratio=0, ) -> Tuple[EstimatorSubclass, float]: start_time = time.time() + fe_params = {} + for param, value in config_dic.items(): + if param.startswith("fe."): + fe_params[param] = value + + for param, value in fe_params.items(): + config_dic.pop(param) + estimator_class = estimator_class or task.estimator_class_from_str(estimator_name) estimator = estimator_class( **config_dic, task=task, n_jobs=n_jobs, ) + if fit_kwargs is None: fit_kwargs = {} diff --git a/flaml/automl/model.py b/flaml/automl/model.py index b451f60c59..8d7c15fcce 100644 --- a/flaml/automl/model.py +++ b/flaml/automl/model.py @@ -2,6 +2,7 @@ # * Copyright (c) FLAML authors. All rights reserved. # * Licensed under the MIT License. See LICENSE file in the # * project root for license information. +import inspect import logging import math import os @@ -9,52 +10,41 @@ import signal import sys import time +import warnings from contextlib import contextmanager from functools import partial from typing import Callable, List, Union import numpy as np +import sklearn +from sklearn.dummy import DummyClassifier, DummyRegressor +from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor, RandomForestClassifier, RandomForestRegressor +from sklearn.exceptions import ConvergenceWarning +from sklearn.linear_model import ElasticNet, LassoLars, LogisticRegression, SGDClassifier, SGDRegressor +from sklearn.preprocessing import Normalizer +from sklearn.svm import LinearSVC +from xgboost import __version__ as xgboost_version from flaml import tune -from flaml.automl.data import ( - group_counts, -) +from flaml.automl.data import group_counts +from flaml.automl.spark import ERROR as SPARK_ERROR +from flaml.automl.spark import DataFrame, Series, psDataFrame, psSeries, sparkDataFrame +from flaml.automl.spark.utils import len_labels, to_pandas_on_spark from flaml.automl.task.factory import task_factory -from flaml.automl.task.task import ( - NLG_TASKS, - SEQCLASSIFICATION, - SEQREGRESSION, - SUMMARIZATION, - TOKENCLASSIFICATION, - Task, -) +from flaml.automl.task.task import NLG_TASKS, SEQCLASSIFICATION, SEQREGRESSION, SUMMARIZATION, TOKENCLASSIFICATION, Task + +SKLEARN_VERSION = sklearn.__version__ + +warnings.filterwarnings("ignore", category=ConvergenceWarning) -try: - from sklearn.dummy import DummyClassifier, DummyRegressor - from sklearn.ensemble import ( - ExtraTreesClassifier, - ExtraTreesRegressor, - RandomForestClassifier, - RandomForestRegressor, - ) - from sklearn.linear_model import LogisticRegression - from xgboost import __version__ as xgboost_version -except ImportError: - pass try: from scipy.sparse import issparse except ImportError: - pass -from flaml.automl.spark import ERROR as SPARK_ERROR -from flaml.automl.spark import DataFrame, Series, psDataFrame, psSeries, sparkDataFrame -from flaml.automl.spark.configs import ( - ParamList_LightGBM_Classifier, - ParamList_LightGBM_Ranker, - ParamList_LightGBM_Regressor, -) -from flaml.automl.spark.utils import len_labels, to_pandas_on_spark + def issparse(x): + return False + if DataFrame is not None: from pandas import to_datetime @@ -248,6 +238,8 @@ def fit(self, X_train, y_train, budget=None, free_mem_ratio=0, **kwargs): Returns: 
train_time: A float of the training time in seconds. """ + if "is_retrain" in kwargs: + kwargs.pop("is_retrain") if ( getattr(self, "limit_resource", None) and resource is not None @@ -461,6 +453,8 @@ def fit( Returns: train_time: A float of the training time in seconds. """ + if "is_retrain" in kwargs: + kwargs.pop("is_retrain") df_train, label_col = self._preprocess(X_train, y_train, index_col=index_col, return_label=True) kwargs["labelCol"] = label_col train_time = self._fit(df_train, **kwargs) @@ -471,11 +465,10 @@ def _fit(self, df_train: sparkDataFrame, **kwargs): pipeline_model = self.estimator_class(**self.params, **kwargs) if logger.level == logging.DEBUG: logger.debug(f"flaml.automl.model - {pipeline_model} fit started with params {self.params}") - pipeline_model.fit(df_train) + self._model = pipeline_model.fit(df_train) if logger.level == logging.DEBUG: logger.debug(f"flaml.automl.model - {pipeline_model} fit finished") train_time = time.time() - current_time - self._model = pipeline_model return train_time def predict(self, X, index_col="tmp_index_col", return_all=False, **kwargs): @@ -527,6 +520,13 @@ class j. logger.warning("Estimator is not fit yet. Please run fit() before predict().") return np.ones(X.shape[0]) + @property + def estimator_params(self): + if hasattr(self, "estimator_class") and self.estimator_class is not None: + return list(inspect.signature(self.estimator_class).parameters.keys()) + else: + return [] + class SparkLGBMEstimator(SparkEstimator): """The class for fine-tuning spark version lightgbm models, using SynapseML API.""" @@ -602,7 +602,6 @@ def __init__(self, task="binary", **config): raise ImportError(err_msg) self.estimator_class = LightGBMRegressor - self.estimator_params = ParamList_LightGBM_Regressor elif "rank" == task: try: from synapse.ml.lightgbm import LightGBMRanker @@ -610,7 +609,6 @@ def __init__(self, task="binary", **config): raise ImportError(err_msg) self.estimator_class = LightGBMRanker - self.estimator_params = ParamList_LightGBM_Ranker else: try: from synapse.ml.lightgbm import LightGBMClassifier @@ -618,7 +616,6 @@ def __init__(self, task="binary", **config): raise ImportError(err_msg) self.estimator_class = LightGBMClassifier - self.estimator_params = ParamList_LightGBM_Classifier self._time_per_iter = None self._train_size = 0 self._mem_per_iter = -1 @@ -634,6 +631,8 @@ def fit( index_col="tmp_index_col", **kwargs, ): + if "is_retrain" in kwargs: + kwargs.pop("is_retrain") start_time = time.time() if self.model_n_classes_ is None and self._task not in ["regression", "rank"]: self.model_n_classes_, self.model_classes_ = len_labels(y_train, return_labels=True) @@ -703,6 +702,8 @@ def fit( def _fit(self, df_train: sparkDataFrame, **kwargs): current_time = time.time() + if "dataTransferMode" not in kwargs: + kwargs["dataTransferMode"] = "bulk" model = self.estimator_class(**self.params, **kwargs) if logger.level == logging.DEBUG: logger.debug(f"flaml.automl.model - {model} fit started with params {self.params}") @@ -715,6 +716,138 @@ def _fit(self, df_train: sparkDataFrame, **kwargs): return train_time +class SparkRandomForestEstimator(SparkEstimator): + """The SparkEstimator class for Random Forest.""" + + nrows = 101 + ITER_HP = "maxIter" + + @classmethod + def search_space(cls, data_size, task, **params): + SparkRandomForestEstimator.nrows = int(data_size[0]) + upper = min(2048, SparkRandomForestEstimator.nrows) + init = 1 / np.sqrt(data_size[1]) if task.is_classification() else 1 + lower = min(0.1, init) + # upper = max(5, 
min(32768, int(data_size[0]))) # upper must be larger than lower + + space = { + "numTrees": { + "domain": tune.lograndint(lower=4, upper=max(5, upper)), + "init_value": 4, + "low_cost_init_value": 4, + }, + "featureSubsetStrategy": { + "domain": tune.loguniform(lower=lower, upper=1.0), + "init_value": init, + }, + "maxDepth": { + "domain": tune.lograndint( + lower=4, + upper=max(5, min(32768, SparkRandomForestEstimator.nrows >> 1)), # + ), + "init_value": 4, + "low_cost_init_value": 4, + }, + } + + if task.is_classification(): + space["impurity"] = { + "domain": tune.choice(["gini", "entropy"]), + # "init_value": "gini", + } + + return space + + def __init__(self, task="classification", **config): + super().__init__(task, **config) + if "verbose" in self.params: + self.params.pop("verbose") + if "n_jobs" in self.params: + self.params.pop("n_jobs") + if self._task.is_classification(): + from pyspark.ml.classification import RandomForestClassifier + + self.estimator_class = RandomForestClassifier + else: + from pyspark.ml.regression import RandomForestRegressor + + self.estimator_class = RandomForestRegressor + + self._task = task + self._model = None + self._time_per_iter = None + self._train_size = 0 + self._mem_per_iter = -1 + self.model_classes_ = None + self.model_n_classes_ = None + + def fit( + self, + X_train, + y_train=None, + budget=None, + free_mem_ratio=0, + index_col="tmp_index_col", + **kwargs, + ): + if "is_retrain" in kwargs: + kwargs.pop("is_retrain") + start_time = time.time() + if self.model_n_classes_ is None and self._task not in ["regression", "rank"]: + self.model_n_classes_, self.model_classes_ = len_labels(y_train, return_labels=True) + df_train, label_col = self._preprocess(X_train, y_train, index_col=index_col, return_label=True) + _kwargs = kwargs.copy() + # TODO: update regression model and rank model, update ParamList_LightGBM_ + if self._task not in ["regression", "rank"]: + if "objective" not in _kwargs: + _kwargs["objective"] = "binary" if self.model_n_classes_ == 2 else "multiclass" + for k in list(_kwargs.keys()): + if k not in self.estimator_params: + _kwargs.pop(k) + self.params["featureSubsetStrategy"] = str(self.params["featureSubsetStrategy"]) + _kwargs["labelCol"] = label_col + self._fit(df_train, **_kwargs) + train_time = time.time() - start_time + return train_time + + def _fit(self, df_train: sparkDataFrame, **kwargs): + current_time = time.time() + model = self.estimator_class(**self.params, **kwargs) + if logger.level == logging.DEBUG: + logger.debug(f"flaml.automl.model - {model} fit started with params {self.params}") + self._model = model.fit(df_train) + self._model.classes_ = self.model_classes_ + self._model.n_classes_ = self.model_n_classes_ + if logger.level == logging.DEBUG: + logger.debug(f"flaml.automl.model - {model} fit finished") + train_time = time.time() - current_time + return train_time + + def predict(self, X, index_col="tmp_index_col", return_all=False, **kwargs): + """Predict label from features. + Args: + X: A pyspark or pyspark.pandas dataframe of featurized instances, shape n*m. + index_col: A str of the index column name. Default to "tmp_index_col". + return_all: A bool of whether to return all the prediction results. Default to False. + + Returns: + A pyspark.pandas series of shape n*1 if return_all is False. Otherwise, a pyspark.pandas dataframe. 
+ """ + if self._model is not None: + X = self._preprocess(X, index_col=index_col) + pred = self._model.transform(X) + predictions = to_pandas_on_spark(pred, index_col=index_col) + predictions.index.name = None + pred_y = predictions["prediction"] + if return_all: + return predictions + else: + return pred_y + else: + logger.warning("Estimator is not fit yet. Please run fit() before predict().") + return np.ones(X.shape[0]) + + class TransformersEstimator(BaseEstimator): """The class for fine-tuning language models, using huggingface transformers API.""" @@ -726,13 +859,9 @@ def __init__(self, task="seq-classification", **config): self.trial_id = str(uuid.uuid1().hex)[:8] if task not in NLG_TASKS: # TODO: not in NLG_TASKS - from .nlp.huggingface.training_args import ( - TrainingArgumentsForAuto as TrainingArguments, - ) + from .nlp.huggingface.training_args import TrainingArgumentsForAuto as TrainingArguments else: - from .nlp.huggingface.training_args import ( - Seq2SeqTrainingArgumentsForAuto as TrainingArguments, - ) + from .nlp.huggingface.training_args import Seq2SeqTrainingArgumentsForAuto as TrainingArguments self._TrainingArguments = TrainingArguments @classmethod @@ -887,9 +1016,7 @@ def tokenizer(self): @property def data_collator(self): - from flaml.automl.nlp.huggingface.data_collator import ( - task_to_datacollator_class, - ) + from flaml.automl.nlp.huggingface.data_collator import task_to_datacollator_class from flaml.automl.task.task import Task data_collator_class = task_to_datacollator_class.get( @@ -941,6 +1068,8 @@ def fit( except ImportError: self._use_ray = False + if "is_retrain" in kwargs: + kwargs.pop("is_retrain") this_params = self.params self._kwargs = kwargs @@ -1029,6 +1158,10 @@ def on_epoch_end(self, args, state, control, **callback_kwargs): self.intermediate_results = [ x[1] for x in sorted(self._trainer.intermediate_results.items(), key=lambda x: x[0]) ] + self._model = { + "model": self._trainer.model, + "tokenizer": self.tokenizer, + } self._trainer = None return time.time() - start_time @@ -1346,6 +1479,10 @@ def _preprocess(self, X): return X def fit(self, X_train, y_train, budget=None, free_mem_ratio=0, **kwargs): + if "is_retrain" in kwargs: + is_retrain = kwargs.pop("is_retrain") + else: + is_retrain = False start_time = time.time() deadline = start_time + budget if budget else np.inf n_iter = self.params.get(self.ITER_HP, self.DEFAULT_ITER) @@ -1353,11 +1490,15 @@ def fit(self, X_train, y_train, budget=None, free_mem_ratio=0, **kwargs): if not self.HAS_CALLBACK: mem0 = psutil.virtual_memory().available if psutil is not None else 1 if ( - (not self._time_per_iter or abs(self._train_size - X_train.shape[0]) > 4) - and budget is not None - or self._mem_per_iter < 0 - and psutil is not None - ) and n_iter > 1: + ( + (not self._time_per_iter or abs(self._train_size - X_train.shape[0]) > 4) + and budget is not None + or self._mem_per_iter < 0 + and psutil is not None + ) + and n_iter > 1 + and not is_retrain + ): self.params[self.ITER_HP] = 1 self._t1 = self._fit(X_train, y_train, **kwargs) if budget is not None and self._t1 >= budget or n_iter == 1: @@ -1542,6 +1683,8 @@ def __init__( def fit(self, X_train, y_train, budget=None, free_mem_ratio=0, **kwargs): import xgboost as xgb + if "is_retrain" in kwargs: + kwargs.pop("is_retrain") start_time = time.time() deadline = start_time + budget if budget else np.inf if issparse(X_train): @@ -1591,26 +1734,11 @@ def predict(self, X, **kwargs): @classmethod def _callbacks(cls, start_time, deadline, 
free_mem_ratio): - try: - from xgboost.callback import TrainingCallback - except ImportError: # for xgboost<1.3 + if xgb_callback: + return [XGBoostResourceLimit(start_time, deadline, free_mem_ratio)] + else: return None - class ResourceLimit(TrainingCallback): - def after_iteration(self, model, epoch, evals_log) -> bool: - now = time.time() - if epoch == 0: - self._time_per_iter = now - start_time - if now + self._time_per_iter > deadline: - return True - if psutil is not None: - mem = psutil.virtual_memory() - if mem.available / mem.total < free_mem_ratio: - return True - return False - - return [ResourceLimit()] - class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator): """The class for tuning XGBoost with unlimited depth, using sklearn API.""" @@ -1658,6 +1786,8 @@ def __init__( self._xgb_version = xgb.__version__ def fit(self, X_train, y_train, budget=None, free_mem_ratio=0, **kwargs): + if "is_retrain" in kwargs: + kwargs.pop("is_retrain") if issparse(X_train) and self._xgb_version < "1.6.0": # "auto" fails for sparse input since xgboost 1.6.0 self.params["tree_method"] = "auto" @@ -1913,6 +2043,8 @@ def __init__( self.estimator_class = CatBoostRegressor def fit(self, X_train, y_train, budget=None, free_mem_ratio=0, **kwargs): + if "is_retrain" in kwargs: + kwargs.pop("is_retrain") start_time = time.time() deadline = start_time + budget if budget else np.inf train_dir = f"catboost_{str(start_time)}" @@ -1964,20 +2096,7 @@ def fit(self, X_train, y_train, budget=None, free_mem_ratio=0, **kwargs): @classmethod def _callbacks(cls, start_time, deadline, free_mem_ratio): - class ResourceLimit: - def after_iteration(self, info) -> bool: - now = time.time() - if info.iteration == 1: - self._time_per_iter = now - start_time - if now + self._time_per_iter > deadline: - return False - if psutil is not None and free_mem_ratio is not None: - mem = psutil.virtual_memory() - if mem.available / mem.total < free_mem_ratio: - return False - return True # can continue - - return [ResourceLimit()] + return [CatBoostResourceLimit(start_time, deadline, free_mem_ratio)] class KNeighborsEstimator(BaseEstimator): @@ -2030,6 +2149,633 @@ def _preprocess(self, X): return X +class SVCEstimator(SKLearnEstimator): + """The class for tuning Linear Support Vector Machine Classifier.""" + + """Reference: https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html""" + ITER_HP = "max_iter" + + @classmethod + def search_space(cls, **params): + return { + "C": { + "domain": tune.loguniform(lower=0.03125, upper=32768.0), + "init_value": 1.0, + }, + "penalty": { + "domain": tune.choice(["l1", "l2"]), + "init_value": "l2", + }, + } + + def config2params(self, config: dict) -> dict: + params = super().config2params(config) + params["tol"] = params.get("tol", 0.0001) + if params.get("penalty", "l2") == "l1": + params["dual"] = False + params["loss"] = "squared_hinge" + else: + params["dual"] = False + params["loss"] = params.get("loss", "squared_hinge") + + if "n_jobs" in params: + params.pop("n_jobs") + return params + + def __init__(self, task="binary", **config): + super().__init__(task, **config) + assert self._task.is_classification(), "LinearSVC for classification task only" + self.estimator_class = LinearSVC + + def predict_proba(self, X, **kwargs): + """Predict the probability of each class from features. + + Only works for classification problems + + Args: + X: A numpy array of featurized instances, shape n*m. + + Returns: + A numpy array of shape n*c. c is the # classes. 
+ Each element at (i,j) is the probability for instance i to be in + class j. + """ + assert self._task.is_classification(), "predict_proba() only for classification." + + X = self._preprocess(X) + return self._model._predict_proba_lr(X, **kwargs) + + +class SparkNaiveBayesEstimator(SparkEstimator): + """The class for tuning Naive Bayes Classifier.""" + + """Reference: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.classification.NaiveBayes.html""" + + ITER_HP = "maxIter" + + @classmethod + def search_space(cls, data_size, task, **params): + space = { + "smoothing": { + "domain": tune.loguniform(0.01, 2.0), + "init_value": 1.0, + }, + "modelType": { + # Not using bernoulli since it only supports binary (0/1) features + "domain": tune.choice(["multinomial", "gaussian"]), + }, + } + + return space + + def __init__(self, task="binary", **config): + super().__init__(task, **config) + assert self._task.is_classification(), "Naive Bayes for classification task only" + if "verbose" in self.params: + self.params.pop("verbose") + if "n_jobs" in self.params: + self.params.pop("n_jobs") + + from pyspark.ml.classification import NaiveBayes + + self.estimator_class = NaiveBayes + + self._task = task + self._model = None + self._time_per_iter = None + self._train_size = 0 + self._mem_per_iter = -1 + self.model_classes_ = None + self.model_n_classes_ = None + + +class SGDEstimator(SKLearnEstimator): + """The class for tuning Stochastic Gradient Descent model.""" + + """Reference: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html""" + """Reference: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html""" + + ITER_HP = "max_iter" + + @classmethod + def search_space(cls, task, **params): + if task.is_classification(): + loss_func_space = [ + "log_loss" if SKLEARN_VERSION >= "1.1" else "log", + "modified_huber", + ] + eps_init = 0.1 + power_t_init = 0.5 + else: + loss_func_space = ["squared_error", "huber", "epsilon_insensitive", "squared_epsilon_insensitive"] + eps_init = 0.1 + power_t_init = 0.25 + space = { + "loss": { + "domain": tune.choice(loss_func_space), + }, + "penalty": { + "domain": tune.choice(["l1", "l2", "elasticnet", "None"]), + "init_value": "l2", + }, + "alpha": { + "domain": tune.loguniform(lower=1e-7, upper=1e-1), + "init_value": 0.0001, + }, + "l1_ratio": { + "domain": tune.loguniform(lower=1e-9, upper=1), + "init_value": 0.15, + }, + "epsilon": { + "domain": tune.loguniform(lower=1e-5, upper=1e-1), + "init_value": eps_init, + }, + "learning_rate": { + "domain": tune.choice(["optimal", "invscaling", "constant"]), + "init_value": "invscaling", + }, + "eta0": { + "domain": tune.loguniform(lower=1e-7, upper=1e-1), + "init_value": 0.01, + }, + "power_t": { + "domain": tune.uniform(lower=1e-5, upper=1), + "init_value": power_t_init, + }, + "average": { + "domain": tune.choice([False, True]), + "init_value": False, + }, + } + return space + + def config2params(self, config: dict) -> dict: + params = super().config2params(config) + params["tol"] = params.get("tol", 0.0001) + params["loss"] = params.get("loss", None) + if params["loss"] is None and self._task.is_classification(): + params["loss"] = "log_loss" if SKLEARN_VERSION >= "1.1" else "log" + if not self._task.is_classification(): + params.pop("n_jobs") + + if params.get("penalty") != "elasticnet": + if "l1_ratio" in params: + params.pop("l1_ratio") + + # epsilon is only used by the modified_huber, huber and epsilon-insensitive losses + if params.get("loss") not in ("modified_huber", "huber", "epsilon_insensitive", "squared_epsilon_insensitive"): 
+ if "epsilon" in params: + params.pop("epsilon") + + # learning_rate = "invscaling" -> requires power_t + if params.get("learning_rate") != "invscaling": + if "power_t" in params: + params.pop("power_t") + + # learning_rate in ["invscaling", "constant"] -> requires eta0 + if params.get("learning_rate") not in ["invscaling", "constant"]: + if "eta0" in params: + params.pop("eta0") + + return params + + def __init__(self, task="binary", **config): + super().__init__(task, **config) + if self._task.is_classification(): + self.estimator_class = SGDClassifier + elif self._task.is_regression(): + self.estimator_class = SGDRegressor + else: + raise ValueError("SGD only supports classification and regression tasks") + self.normalizer = Normalizer() + + def _fit(self, X_train, y_train, **kwargs): + current_time = time.time() + if "groups" in kwargs: + kwargs = kwargs.copy() + groups = kwargs.pop("groups") + if self._task == "rank": + kwargs["group"] = group_counts(groups) + X_train = self._preprocess(X_train) + params = self.params.copy() + if params.get("penalty") == "None": + params["penalty"] = None + model = self.estimator_class(**params) + if logger.level == logging.DEBUG: + logger.debug(f"flaml.automl.model - {model} fit started with params {self.params}") + model.fit(X_train, y_train, **kwargs) + if logger.level == logging.DEBUG: + logger.debug(f"flaml.automl.model - {model} fit finished") + train_time = time.time() - current_time + self._model = model + return train_time + + def predict_proba(self, X, **kwargs): + """Predict the probability of each class from features. + + Only works for classification problems + + Args: + X: A numpy array of featurized instances, shape n*m. + + Returns: + A numpy array of shape n*c. c is the # classes. + Each element at (i,j) is the probability for instance i to be in + class j. + """ + assert self._task.is_classification(), "predict_proba() only for classification." + + X = self._preprocess(X) + return self._model.predict_proba(X) + + def _preprocess(self, X): + X = super()._preprocess(X) + X = self.normalizer.fit_transform(X) + return X + + +class ElasticNetEstimator(SKLearnEstimator): + """The class for tuning Elastic Net regression model.""" + + """Reference: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html""" + + ITER_HP = "max_iter" + + @classmethod + def search_space(cls, **params): + return { + "alpha": { + "domain": tune.loguniform(lower=0.0001, upper=1.0), + "init_value": 0.1, + }, + "l1_ratio": { + "domain": tune.uniform(lower=0.0, upper=1.0), + "init_value": 0.5, + }, + "selection": { + "domain": tune.choice(["cyclic", "random"]), + "init_value": "cyclic", + }, + } + + def config2params(self, config: dict) -> dict: + params = super().config2params(config) + params["tol"] = params.get("tol", 0.0001) + if "n_jobs" in params: + params.pop("n_jobs") + return params + + def __init__(self, task="regression", **config): + super().__init__(task, **config) + assert self._task.is_regression(), "ElasticNet for regression task only" + self.estimator_class = ElasticNet + + +class LassoLarsEstimator(SKLearnEstimator): + """The class for tuning Lasso model fit with Least Angle Regression a.k.a. 
Lars.""" + + """Reference: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoLars.html""" + + ITER_HP = "max_iter" + + @classmethod + def search_space(cls, task=None, **params): + return { + "alpha": { + "domain": tune.loguniform(lower=1e-4, upper=1.0), + "init_value": 0.1, + }, + "fit_intercept": { + "domain": tune.choice([True, False]), + "init_value": True, + }, + "eps": { + "domain": tune.loguniform(lower=1e-16, upper=1e-4), + "init_value": 2.220446049250313e-16, + }, + } + + def config2params(self, config: dict) -> dict: + params = super().config2params(config) + if "n_jobs" in params: + params.pop("n_jobs") + return params + + def __init__(self, task="regression", **config): + super().__init__(task, **config) + assert self._task.is_regression(), "LassoLars for regression task only" + self.estimator_class = LassoLars + + def predict(self, X, **kwargs): + X = self._preprocess(X) + return self._model.predict(X, **kwargs) + + +class SparkGLREstimator(SparkEstimator): + """The class for tuning Generalized Linear Regression PySpark model.""" + + """Reference: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.regression.GeneralizedLinearRegression.html""" + + ITER_HP = "maxIter" + + @classmethod + def search_space(cls, data_size, task, **params): + rules = { + "gaussian": ["identity", "log", "inverse"], + "binomial": ["logit", "probit", "cloglog"], + "poisson": ["log", "identity", "sqrt"], + "gamma": ["inverse", "identity", "log"], + } + + space = { + "regParam": { + "domain": tune.loguniform(0.01, 1.0), + "init_value": 0.1, + }, + } + + familyLinks = [] + + for family, members in rules.items(): + for member in members: + familyLinks.append({"family": family, "link": member}) + familyLinks.append({"family": "tweedie", "link": None}) + space["familyLinks"] = {"domain": tune.choice(familyLinks), "init_value": familyLinks[0]} + return space + + def config2params(self, config): + config = super().config2params(config) + for k, v in config["familyLinks"].items(): + config[k] = v + del config["familyLinks"] + return config + + def __init__(self, task="binary", **config): + super().__init__(task, **config) + assert self._task.is_regression(), "Generalized Linear Regression for regression task only" + if "verbose" in self.params: + self.params.pop("verbose") + if "n_jobs" in self.params: + self.params.pop("n_jobs") + + from pyspark.ml.regression import GeneralizedLinearRegression + + self.estimator_class = GeneralizedLinearRegression + + self._task = task + self._model = None + self._time_per_iter = None + self._train_size = 0 + self._mem_per_iter = -1 + self.model_classes_ = None + self.model_n_classes_ = None + + +class SparkLinearRegressionEstimator(SparkEstimator): + """The class for tuning Linear Regression PySpark model.""" + + """Reference: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.regression.LinearRegression.html""" + + ITER_HP = "maxIter" + + @classmethod + def search_space(cls, data_size, task, **params): + space = { + "regParam": { + "domain": tune.loguniform(0.01, 1.0), + "init_value": 0.1, + }, + "elasticNetParam": { + "domain": tune.uniform(0.0, 1.0), + "init_value": 0.0, + }, + "fitIntercept": { + "domain": tune.choice([True, False]), + "init_value": True, + }, + "standardization": { + "domain": tune.choice([True, False]), + "init_value": True, + }, + "aggregationDepth": { + "domain": tune.randint(2, 10), + "init_value": 2, + }, + "loss": { + "domain": tune.choice(["squaredError", "huber"]), + 
"init_value": "squaredError", + }, + "epsilon": { + "domain": tune.uniform(1.0001, 2), + "init_value": 1.35, + }, + } + + return space + + def __init__(self, task="binary", **config): + super().__init__(task, **config) + assert self._task.is_regression(), "Linear Regression for regression task only" + if "verbose" in self.params: + self.params.pop("verbose") + if "n_jobs" in self.params: + self.params.pop("n_jobs") + + from pyspark.ml.regression import LinearRegression + + self.estimator_class = LinearRegression + + self._task = task + self._model = None + self._time_per_iter = None + self._train_size = 0 + self._mem_per_iter = -1 + self.model_classes_ = None + self.model_n_classes_ = None + + def config2params(self, config): + config = super().config2params(config) + if config["loss"] == "huber": + config.pop("elasticNetParam") + return config + + +class SparkLinearSVCEstimator(SparkEstimator): + """The class for tuning Linear SVC PySpark model.""" + + """Reference: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.classification.LinearSVC.html""" + + ITER_HP = "maxIter" + + @classmethod + def search_space(cls, data_size, task, **params): + space = { + "aggregationDepth": { + "domain": tune.randint(2, 10), + "init_value": 2, + }, + "regParam": { + "domain": tune.uniform(0, 1.0), + "init_value": 0, + }, + "fitIntercept": { + "domain": tune.choice([True, False]), + "init_value": True, + }, + "standardization": { + "domain": tune.choice([True, False]), + "init_value": True, + }, + "threshold": { + "domain": tune.uniform(0, 1.0), + "init_value": 0, + }, + } + return space + + def __init__(self, task="binary", **config): + super().__init__(task, **config) + assert self._task.is_binary(), "Linear SVC for binary classification task only" + if "verbose" in self.params: + self.params.pop("verbose") + if "n_jobs" in self.params: + self.params.pop("n_jobs") + from pyspark.ml.classification import LinearSVC + + self.estimator_class = LinearSVC + + +class SparkGBTEstimator(SparkEstimator): + """The class for tuning GBT PySpark model.""" + + """Reference: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.classification.GBTClassifier.html""" + """Reference: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.regression.GBTRegressor.html""" + + ITER_HP = "maxIter" + + @classmethod + def search_space(cls, data_size, task, **params): + space = { + "maxDepth": { + "domain": tune.randint(3, 10), + "init_value": 5, + }, + "maxBins": { + "domain": tune.randint(10, 100), + "init_value": 32, + }, + "stepSize": { + "domain": tune.loguniform(0.01, 1.0), + "init_value": 0.1, + }, + "subsamplingRate": { + "domain": tune.uniform(0.0001, 1.0), + "init_value": 1.0, + }, + "minInstancesPerNode": { + "domain": tune.randint(1, 10), + "init_value": 1, + }, + "minWeightFractionPerNode": { + "domain": tune.uniform(0.0, 0.4999), + "init_value": 0.0, + }, + "minInfoGain": { + "domain": tune.uniform(0.0, 0.1), + "init_value": 0.0, + }, + } + return space + + def __init__(self, task="binary", **config): + super().__init__(task, **config) + assert ( + self._task.is_binary() or self._task.is_regression() + ), "GBT for binary classification task or regression only" + if "verbose" in self.params: + self.params.pop("verbose") + if "n_jobs" in self.params: + self.params.pop("n_jobs") + if self._task.is_binary(): + from pyspark.ml.classification import GBTClassifier + + self.estimator_class = GBTClassifier + else: + from pyspark.ml.regression import GBTRegressor + + 
self.estimator_class = GBTRegressor + + +class SparkAFTSurvivalRegressionEstimator(SparkEstimator): + """The class for tuning AFTSurvivalRegression PySpark model.""" + + """Reference: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.regression.AFTSurvivalRegression.html""" + + ITER_HP = "maxIter" + + @classmethod + def search_space(cls, data_size, task, **params): + space = { + "fitIntercept": { + "domain": tune.choice([True, False]), + "init_value": True, + }, + "aggregationDepth": { + "domain": tune.randint(2, 10), + "init_value": 2, + }, + } + + return space + + def __init__(self, task="binary", **config): + super().__init__(task, **config) + assert self._task.is_regression(), "AFTSurvivalRegression for regression task only" + if "verbose" in self.params: + self.params.pop("verbose") + if "n_jobs" in self.params: + self.params.pop("n_jobs") + + from pyspark.ml.regression import AFTSurvivalRegression + + self.estimator_class = AFTSurvivalRegression + + +class BaseResourceLimit: + def __init__(self, start_time, deadline, free_mem_ratio): + self.start_time = start_time + self.deadline = deadline + self.free_mem_ratio = free_mem_ratio + self._time_per_iter = None + + def check_resource_limits(self, current_time, current_iteration, mllib): + if (mllib == "xgb" and current_iteration == 0) or (mllib == "cat" and current_iteration == 1): + self._time_per_iter = current_time - self.start_time + if current_time + self._time_per_iter > self.deadline: + return False + if psutil is not None and self.free_mem_ratio is not None: + mem = psutil.virtual_memory() + if mem.available / mem.total < self.free_mem_ratio: + return False + return True + + def after_iteration(self, *args, **kwargs) -> bool: + raise NotImplementedError + + +class XGBoostResourceLimit(BaseResourceLimit, TrainingCallback): + def after_iteration(self, model, epoch, evals_log) -> bool: + now = time.time() + return not self.check_resource_limits(now, epoch, "xgb") + + +class CatBoostResourceLimit(BaseResourceLimit): + def after_iteration(self, info) -> bool: + now = time.time() + return self.check_resource_limits(now, info.iteration, "cat") + + class suppress_stdout_stderr: def __init__(self): # Open a pair of null files diff --git a/flaml/automl/spark/configs.py b/flaml/automl/spark/configs.py deleted file mode 100644 index 26584dc479..0000000000 --- a/flaml/automl/spark/configs.py +++ /dev/null @@ -1,97 +0,0 @@ -ParamList_LightGBM_Base = [ - "baggingFraction", - "baggingFreq", - "baggingSeed", - "binSampleCount", - "boostFromAverage", - "boostingType", - "catSmooth", - "categoricalSlotIndexes", - "categoricalSlotNames", - "catl2", - "chunkSize", - "dataRandomSeed", - "defaultListenPort", - "deterministic", - "driverListenPort", - "dropRate", - "dropSeed", - "earlyStoppingRound", - "executionMode", - "extraSeed" "featureFraction", - "featureFractionByNode", - "featureFractionSeed", - "featuresCol", - "featuresShapCol", - "fobj" "improvementTolerance", - "initScoreCol", - "isEnableSparse", - "isProvideTrainingMetric", - "labelCol", - "lambdaL1", - "lambdaL2", - "leafPredictionCol", - "learningRate", - "matrixType", - "maxBin", - "maxBinByFeature", - "maxCatThreshold", - "maxCatToOnehot", - "maxDeltaStep", - "maxDepth", - "maxDrop", - "metric", - "microBatchSize", - "minDataInLeaf", - "minDataPerBin", - "minDataPerGroup", - "minGainToSplit", - "minSumHessianInLeaf", - "modelString", - "monotoneConstraints", - "monotoneConstraintsMethod", - "monotonePenalty", - "negBaggingFraction", - "numBatches", - 
"numIterations", - "numLeaves", - "numTasks", - "numThreads", - "objectiveSeed", - "otherRate", - "parallelism", - "passThroughArgs", - "posBaggingFraction", - "predictDisableShapeCheck", - "predictionCol", - "repartitionByGroupingColumn", - "seed", - "skipDrop", - "slotNames", - "timeout", - "topK", - "topRate", - "uniformDrop", - "useBarrierExecutionMode", - "useMissing", - "useSingleDatasetMode", - "validationIndicatorCol", - "verbosity", - "weightCol", - "xGBoostDartMode", - "zeroAsMissing", - "objective", -] -ParamList_LightGBM_Classifier = ParamList_LightGBM_Base + [ - "isUnbalance", - "probabilityCol", - "rawPredictionCol", - "thresholds", -] -ParamList_LightGBM_Regressor = ParamList_LightGBM_Base + ["tweedieVariancePower"] -ParamList_LightGBM_Ranker = ParamList_LightGBM_Base + [ - "groupCol", - "evalAt", - "labelGain", - "maxPosition", -] diff --git a/flaml/automl/state.py b/flaml/automl/state.py index f966111696..a5897f7234 100644 --- a/flaml/automl/state.py +++ b/flaml/automl/state.py @@ -65,6 +65,7 @@ def __init__( custom_hp=None, max_iter=None, budget=None, + featurization="auto", ): self.init_eci = learner_class.cost_relative2lgbm() if budget >= 0 else 1 self._search_space_domain = {} @@ -82,6 +83,7 @@ def __init__( else: data_size = data.shape search_space = learner_class.search_space(data_size=data_size, task=task) + self.data_size = data_size if custom_hp is not None: @@ -288,9 +290,11 @@ def _compute_with_config_base( budget = ( None if state.time_budget < 0 - else state.time_budget - state.time_from_start - if sample_size == state.data_size[0] - else (state.time_budget - state.time_from_start) / 2 * sample_size / state.data_size[0] + else ( + state.time_budget - state.time_from_start + if sample_size == state.data_size[0] + else (state.time_budget - state.time_from_start) / 2 * sample_size / state.data_size[0] + ) ) ( @@ -351,6 +355,7 @@ def _train_with_config( estimator: str, config_w_resource: dict, sample_size: Optional[int] = None, + is_retrain: bool = False, ): if not sample_size: sample_size = config_w_resource.get("FLAML_sample_size", len(self.y_train_all)) @@ -376,9 +381,8 @@ def _train_with_config( this_estimator_kwargs[ "groups" ] = groups # NOTE: _train_with_config is after kwargs is updated to fit_kwargs_by_estimator - + this_estimator_kwargs.update({"is_retrain": is_retrain}) budget = None if self.time_budget < 0 else self.time_budget - self.time_from_start - estimator, train_time = train_estimator( X_train=sampled_X_train, y_train=sampled_y_train, diff --git a/flaml/automl/task/generic_task.py b/flaml/automl/task/generic_task.py index 8d7b4defdd..df61d7e664 100644 --- a/flaml/automl/task/generic_task.py +++ b/flaml/automl/task/generic_task.py @@ -16,12 +16,7 @@ unique_pandas_on_spark, unique_value_first_index, ) -from flaml.automl.task.task import ( - TS_FORECAST, - TS_FORECASTPANEL, - Task, - get_classification_objective, -) +from flaml.automl.task.task import TS_FORECAST, TS_FORECASTPANEL, Task, get_classification_objective from flaml.config import RANDOM_SEED try: @@ -53,13 +48,24 @@ def estimators(self): from flaml.automl.contrib.histgb import HistGradientBoostingEstimator from flaml.automl.model import ( CatBoostEstimator, + ElasticNetEstimator, ExtraTreesEstimator, KNeighborsEstimator, + LassoLarsEstimator, LGBMEstimator, LRL1Classifier, LRL2Classifier, RandomForestEstimator, + SGDEstimator, + SparkAFTSurvivalRegressionEstimator, + SparkGBTEstimator, + SparkGLREstimator, SparkLGBMEstimator, + SparkLinearRegressionEstimator, + SparkLinearSVCEstimator, + 
SparkNaiveBayesEstimator, + SparkRandomForestEstimator, + SVCEstimator, TransformersEstimator, TransformersEstimatorModelSelection, XGBoostLimitDepthEstimator, @@ -72,6 +78,7 @@ def estimators(self): "rf": RandomForestEstimator, "lgbm": LGBMEstimator, "lgbm_spark": SparkLGBMEstimator, + "rf_spark": SparkRandomForestEstimator, "lrl1": LRL1Classifier, "lrl2": LRL2Classifier, "catboost": CatBoostEstimator, @@ -80,6 +87,17 @@ def estimators(self): "transformer": TransformersEstimator, "transformer_ms": TransformersEstimatorModelSelection, "histgb": HistGradientBoostingEstimator, + # Above are open-source, below are internal + "svc": SVCEstimator, + "sgd": SGDEstimator, + "nb_spark": SparkNaiveBayesEstimator, + "enet": ElasticNetEstimator, + "lassolars": LassoLarsEstimator, + "glr_spark": SparkGLREstimator, + "lr_spark": SparkLinearRegressionEstimator, + "svc_spark": SparkLinearSVCEstimator, + "gbt_spark": SparkGBTEstimator, + "aft_spark": SparkAFTSurvivalRegressionEstimator, } return self._estimators @@ -271,8 +289,8 @@ def _split_pyspark(state, X_train_all, y_train_all, split_ratio, stratify=None): seed=RANDOM_SEED, ) columns_to_drop = [c for c in df_all_train.columns if c in [stratify_column, "sample_weight"]] - X_train = df_all_train.drop(columns_to_drop) - X_val = df_all_val.drop(columns_to_drop) + X_train = df_all_train.drop(columns=columns_to_drop) + X_val = df_all_val.drop(columns=columns_to_drop) y_train = df_all_train[stratify_column] y_val = df_all_val[stratify_column] @@ -497,14 +515,37 @@ def prepare_data( last = first[i] + 1 rest.extend(range(last, len(y_train_all))) X_first = X_train_all.iloc[first] if data_is_df else X_train_all[first] - X_rest = X_train_all.iloc[rest] if data_is_df else X_train_all[rest] - y_rest = ( - y_train_all[rest] - if isinstance(y_train_all, np.ndarray) - else iloc_pandas_on_spark(y_train_all, rest) - if is_spark_dataframe - else y_train_all.iloc[rest] - ) + if len(first) < len(y_train_all) / 2: + # Get X_rest and y_rest with drop, sparse matrix can't apply np.delete + X_rest = ( + np.delete(X_train_all, first, axis=0) + if isinstance(X_train_all, np.ndarray) + else X_train_all.drop(first.tolist()) + if data_is_df + else X_train_all[rest] + ) + y_rest = ( + np.delete(y_train_all, first, axis=0) + if isinstance(y_train_all, np.ndarray) + else y_train_all.drop(first.tolist()) + if data_is_df + else y_train_all[rest] + ) + else: + X_rest = ( + iloc_pandas_on_spark(X_train_all, rest) + if is_spark_dataframe + else X_train_all.iloc[rest] + if data_is_df + else X_train_all[rest] + ) + y_rest = ( + iloc_pandas_on_spark(y_train_all, rest) + if is_spark_dataframe + else y_train_all.iloc[rest] + if data_is_df + else y_train_all[rest] + ) stratify = y_rest if split_type == "stratified" else None X_train, X_val, y_train, y_val = self._train_test_split( state, X_rest, y_rest, first, rest, split_ratio, stratify @@ -513,6 +554,12 @@ def prepare_data( y_train = concat(label_set, y_train) if data_is_df else np.concatenate([label_set, y_train]) X_val = concat(X_first, X_val) y_val = concat(label_set, y_val) if data_is_df else np.concatenate([label_set, y_val]) + + if isinstance(y_train, (psDataFrame, pd.DataFrame)) and y_train.shape[1] == 1: + y_train = y_train[y_train.columns[0]] + y_val = y_val[y_val.columns[0]] + y_train.name = y_val.name = y_rest.name + elif self.is_regression(): X_train, X_val, y_train, y_val = self._train_test_split( state, X_train_all, y_train_all, split_ratio=split_ratio @@ -810,27 +857,23 @@ def default_estimator_list(self, estimator_list: 
List[str], is_spark_dataframe: elif self.is_ts_forecastpanel(): estimator_list = ["tft"] else: + estimator_list = [ + "lgbm", + "rf", + "xgboost", + "extra_tree", + "xgb_limitdepth", + "lgbm_spark", + "rf_spark", + "sgd", + ] try: import catboost - estimator_list = [ - "lgbm", - "rf", - "catboost", - "xgboost", - "extra_tree", - "xgb_limitdepth", - "lgbm_spark", - ] + estimator_list += ["catboost"] except ImportError: - estimator_list = [ - "lgbm", - "rf", - "xgboost", - "extra_tree", - "xgb_limitdepth", - "lgbm_spark", - ] + pass + # if self.is_ts_forecast(): # # catboost is removed because it has a `name` parameter, making it incompatible with hcrystalball # if "catboost" in estimator_list: @@ -862,9 +905,7 @@ def default_metric(self, metric: str) -> str: return metric if self.is_nlp(): - from flaml.automl.nlp.utils import ( - load_default_huggingface_metric_for_task, - ) + from flaml.automl.nlp.utils import load_default_huggingface_metric_for_task return load_default_huggingface_metric_for_task(self.name) elif self.is_binary(): diff --git a/flaml/automl/task/time_series_task.py b/flaml/automl/task/time_series_task.py index 7dc9f84a22..15eac2a8e8 100644 --- a/flaml/automl/task/time_series_task.py +++ b/flaml/automl/task/time_series_task.py @@ -36,11 +36,17 @@ def estimators(self): LGBM_TS, RF_TS, SARIMAX, + Average, CatBoost_TS, ExtraTrees_TS, HoltWinters, + LassoLars_TS, + Naive, Orbit, Prophet, + SeasonalAverage, + SeasonalNaive, + TCNEstimator, TemporalFusionTransformerEstimator, XGBoost_TS, XGBoostLimitDepth_TS, @@ -57,8 +63,19 @@ def estimators(self): "holt-winters": HoltWinters, "catboost": CatBoost_TS, "tft": TemporalFusionTransformerEstimator, + "lassolars": LassoLars_TS, + "tcn": TCNEstimator, + "snaive": SeasonalNaive, + "naive": Naive, + "savg": SeasonalAverage, + "avg": Average, } + if self._estimators["tcn"] is None: + # remove TCN if import failed + del self._estimators["tcn"] + logger.info("Couldn't import pytorch_lightning, skipping TCN estimator") + try: from prophet import Prophet as foo @@ -71,7 +88,7 @@ def estimators(self): self._estimators["orbit"] = Orbit except ImportError: - logger.info("Couldn't import Prophet, skipping") + logger.info("Couldn't import orbit, skipping") return self._estimators diff --git a/flaml/automl/time_series/__init__.py b/flaml/automl/time_series/__init__.py index b48f266161..76a3087588 100644 --- a/flaml/automl/time_series/__init__.py +++ b/flaml/automl/time_series/__init__.py @@ -1,16 +1,27 @@ from .tft import TemporalFusionTransformerEstimator -from .ts_data import TimeSeriesDataset from .ts_model import ( ARIMA, LGBM_TS, RF_TS, SARIMAX, + Average, CatBoost_TS, ExtraTrees_TS, HoltWinters, + LassoLars_TS, + Naive, Orbit, Prophet, + SeasonalAverage, + SeasonalNaive, TimeSeriesEstimator, XGBoost_TS, XGBoostLimitDepth_TS, ) + +try: + from .tcn import TCNEstimator +except ImportError: + TCNEstimator = None + +from .ts_data import TimeSeriesDataset diff --git a/flaml/automl/time_series/tcn.py b/flaml/automl/time_series/tcn.py new file mode 100644 index 0000000000..cfd04d78f6 --- /dev/null +++ b/flaml/automl/time_series/tcn.py @@ -0,0 +1,285 @@ +# This file is adapted from +# https://github.com/locuslab/TCN/blob/master/TCN/tcn.py +# https://github.com/locuslab/TCN/blob/master/TCN/adding_problem/add_test.py + +import datetime +import logging +import time + +import pandas as pd +import pytorch_lightning as pl +import torch +import torch.nn as nn +import torch.optim as optim +from pytorch_lightning.callbacks import EarlyStopping, 
LearningRateMonitor +from pytorch_lightning.loggers import TensorBoardLogger +from torch.nn.utils import weight_norm +from torch.utils.data import DataLoader, TensorDataset + +from flaml import tune +from flaml.automl.data import add_time_idx_col +from flaml.automl.logger import logger, logger_formatter +from flaml.automl.time_series.ts_data import TimeSeriesDataset +from flaml.automl.time_series.ts_model import TimeSeriesEstimator + + +class Chomp1d(nn.Module): + def __init__(self, chomp_size): + super().__init__() + self.chomp_size = chomp_size + + def forward(self, x): + return x[:, :, : -self.chomp_size].contiguous() + + +class TemporalBlock(nn.Module): + def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2): + super().__init__() + self.conv1 = weight_norm( + nn.Conv1d(n_inputs, n_outputs, kernel_size, stride=stride, padding=padding, dilation=dilation) + ) + self.chomp1 = Chomp1d(padding) + self.relu1 = nn.ReLU() + self.dropout1 = nn.Dropout(dropout) + + self.conv2 = weight_norm( + nn.Conv1d(n_outputs, n_outputs, kernel_size, stride=stride, padding=padding, dilation=dilation) + ) + self.chomp2 = Chomp1d(padding) + self.relu2 = nn.ReLU() + self.dropout2 = nn.Dropout(dropout) + + self.net = nn.Sequential( + self.conv1, self.chomp1, self.relu1, self.dropout1, self.conv2, self.chomp2, self.relu2, self.dropout2 + ) + self.downsample = nn.Conv1d(n_inputs, n_outputs, 1) if n_inputs != n_outputs else None + self.relu = nn.ReLU() + self.init_weights() + + def init_weights(self): + self.conv1.weight.data.normal_(0, 0.01) + self.conv2.weight.data.normal_(0, 0.01) + if self.downsample is not None: + self.downsample.weight.data.normal_(0, 0.01) + + def forward(self, x): + out = self.net(x) + res = x if self.downsample is None else self.downsample(x) + return self.relu(out + res) + + +class TCNForecaster(nn.Module): + def __init__( + self, + input_feature_num, + num_outputs, + num_channels, + kernel_size=2, + dropout=0.2, + ): + super().__init__() + layers = [] + num_levels = len(num_channels) + for i in range(num_levels): + dilation_size = 2**i + in_channels = input_feature_num if i == 0 else num_channels[i - 1] + out_channels = num_channels[i] + layers += [ + TemporalBlock( + in_channels, + out_channels, + kernel_size, + stride=1, + dilation=dilation_size, + padding=(kernel_size - 1) * dilation_size, + dropout=dropout, + ) + ] + + self.network = nn.Sequential(*layers) + self.linear = nn.Linear(num_channels[-1], num_outputs) + + def forward(self, x): + y1 = self.network(x) + return self.linear(y1[:, :, -1]) + + +class TCNForecasterLightningModule(pl.LightningModule): + def __init__(self, model: TCNForecaster, learning_rate: float = 1e-3): + super().__init__() + self.model = model + self.learning_rate = learning_rate + self.loss_fn = nn.MSELoss() + + def forward(self, x): + return self.model(x) + + def step(self, batch, batch_idx): + x, y = batch + y_hat = self.model(x) + loss = self.loss_fn(y_hat, y) + return loss + + def training_step(self, batch, batch_idx): + loss = self.step(batch, batch_idx) + self.log("train_loss", loss) + return loss + + def validation_step(self, batch, batch_idx): + loss = self.step(batch, batch_idx) + self.log("val_loss", loss) + return loss + + def configure_optimizers(self): + return torch.optim.Adam(self.parameters(), lr=self.learning_rate) + + +class DataframeDataset(torch.utils.data.Dataset): + def __init__(self, dataframe, target_column, features_columns, sequence_length, train=True): + self.data = 
torch.tensor(dataframe[features_columns].to_numpy(), dtype=torch.float)
+        self.sequence_length = sequence_length
+        if train:
+            self.labels = torch.tensor(dataframe[target_column].to_numpy(), dtype=torch.float)
+        self.is_train = train
+
+    def __len__(self):
+        return len(self.data) - self.sequence_length + 1
+
+    def __getitem__(self, idx):
+        data = self.data[idx : idx + self.sequence_length]
+        data = data.permute(1, 0)
+        if self.is_train:
+            label = self.labels[idx : idx + self.sequence_length]
+            return data, label
+        else:
+            return data
+
+
+class TCNEstimator(TimeSeriesEstimator):
+    """The class for tuning TCN Forecaster"""
+
+    @classmethod
+    def search_space(cls, data, task, pred_horizon, **params):
+        space = {
+            "num_levels": {
+                "domain": tune.randint(lower=4, upper=20),  # number of TemporalBlock layers
+                "init_value": 4,
+            },
+            "num_hidden": {
+                "domain": tune.randint(lower=4, upper=8),  # hidden = 2^num_hidden
+                "init_value": 5,
+            },
+            "kernel_size": {
+                "domain": tune.choice([2, 3, 5, 7]),  # common choices for kernel size
+                "init_value": 3,
+            },
+            "dropout": {
+                "domain": tune.uniform(lower=0.0, upper=0.5),  # standard range for dropout
+                "init_value": 0.1,
+            },
+            "learning_rate": {
+                "domain": tune.loguniform(lower=1e-4, upper=1e-1),  # typical range for learning rate
+                "init_value": 1e-3,
+            },
+        }
+        return space
+
+    def __init__(self, task="ts_forecast", n_jobs=1, **params):
+        super().__init__(task, **params)
+        logging.getLogger("pytorch_lightning").setLevel(logging.WARNING)
+
+    def fit(self, X_train: TimeSeriesDataset, y_train=None, budget=None, **kwargs):
+        start_time = time.time()
+        if budget is not None:
+            deltabudget = datetime.timedelta(seconds=budget)
+        else:
+            deltabudget = None
+        X_train = self.enrich(X_train)
+        super().fit(X_train, y_train, budget, **kwargs)
+
+        self.batch_size = kwargs.get("batch_size", 64)
+        self.horizon = kwargs.get("period", 1)
+        self.feature_cols = X_train.time_varying_known_reals
+        self.target_col = X_train.target_names[0]
+
+        train_dataset = DataframeDataset(
+            X_train.train_data,
+            self.target_col,
+            self.feature_cols,
+            self.horizon,
+        )
+        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=False)
+        if not X_train.test_data.empty:
+            val_dataset = DataframeDataset(
+                X_train.test_data,
+                self.target_col,
+                self.feature_cols,
+                self.horizon,
+            )
+        else:
+            val_dataset = DataframeDataset(
+                X_train.train_data.sample(frac=0.2, random_state=kwargs.get("random_state", 0)),
+                self.target_col,
+                self.feature_cols,
+                self.horizon,
+            )
+
+        val_loader = DataLoader(val_dataset, batch_size=self.batch_size, shuffle=False)
+
+        model = TCNForecaster(
+            len(self.feature_cols),
+            self.horizon,
+            [2 ** self.params["num_hidden"]] * self.params["num_levels"],
+            self.params["kernel_size"],
+            self.params["dropout"],
+        )
+
+        pl_module = TCNForecasterLightningModule(model, self.params["learning_rate"])
+
+        # Training loop
+        # gpus is deprecated in v1.7 and removed in v2.0
+        # accelerator="auto" handles all cases.
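+        # The Trainer below enforces the tuning budget and early stopping:
+        # - max_time=deltabudget caps wall-clock training time at the remaining budget;
+        # - EarlyStopping halts training once val_loss improves by less than 1e-4 for 10 consecutive checks;
+        # - TensorBoardLogger writes training curves under kwargs["log_dir"] (default "logs/lightning_logs").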
+        trainer = pl.Trainer(
+            max_epochs=kwargs.get("max_epochs", 10),
+            accelerator="auto",
+            callbacks=[
+                EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=10, verbose=False, mode="min"),
+                LearningRateMonitor(),
+            ],
+            logger=TensorBoardLogger(kwargs.get("log_dir", "logs/lightning_logs")),  # log results to TensorBoard
+            max_time=deltabudget,
+            enable_model_summary=False,
+            enable_progress_bar=False,
+        )
+        trainer.fit(
+            pl_module,
+            train_dataloaders=train_loader,
+            val_dataloaders=val_loader,
+        )
+        best_model = trainer.model
+        self._model = best_model
+        train_time = time.time() - start_time
+        return train_time
+
+    def predict(self, X):
+        X = self.enrich(X)
+        if isinstance(X, TimeSeriesDataset):
+            df = X.X_val
+        else:
+            df = X
+        dataset = DataframeDataset(
+            df,
+            self.target_col,
+            self.feature_cols,
+            self.horizon,
+            train=False,
+        )
+        data_loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=False)
+        self._model.eval()
+        raw_preds = []
+        for batch_x in data_loader:
+            raw_pred = self._model(batch_x)
+            raw_preds.append(raw_pred)
+        raw_preds = torch.cat(raw_preds, dim=0)
+        preds = pd.Series(raw_preds.detach().numpy().ravel())
+        return preds
diff --git a/flaml/automl/time_series/ts_model.py b/flaml/automl/time_series/ts_model.py
index 1b581c6a7c..c0a8fe33fc 100644
--- a/flaml/automl/time_series/ts_model.py
+++ b/flaml/automl/time_series/ts_model.py
@@ -26,6 +26,7 @@ class PD:
 from flaml.automl.model import (
     CatBoostEstimator,
     ExtraTreesEstimator,
+    LassoLarsEstimator,
     LGBMEstimator,
     RandomForestEstimator,
     SKLearnEstimator,
@@ -631,6 +632,125 @@ def fit(self, X_train, y_train, budget=None, free_mem_ratio=0, **kwargs):
         return train_time
 
 
+class SimpleForecaster(StatsModelsEstimator):
+    """Base class for naive forecasters such as SeasonalNaive, Naive, SeasonalAverage, and Average."""
+
+    @classmethod
+    def _search_space(cls, data: TimeSeriesDataset, task: Task, pred_horizon: int, **params):
+        return {
+            "season": {
+                "domain": tune.randint(1, pred_horizon),
+                "init_value": pred_horizon,
+            }
+        }
+
+    def joint_preprocess(self, X_train, y_train=None):
+        X_train = self.enrich(X_train)
+
+        self.regressors = []
+
+        if isinstance(X_train, TimeSeriesDataset):
+            data = X_train
+            target_col = data.target_names[0]
+            # this class only supports univariate regression
+            train_df = data.train_data[self.regressors + [target_col]]
+            train_df.index = to_datetime(data.train_data[data.time_col])
+        else:
+            target_col = TS_VALUE_COL
+            train_df = self._join(X_train, y_train)
+
+        self.time_col = data.time_col
+        self.target_names = data.target_names
+
+        train_df = self._preprocess(train_df)
+        return train_df, target_col
+
+    def fit(self, X_train, y_train=None, budget=None, **kwargs):
+        import warnings
+
+        warnings.filterwarnings("ignore")
+        from statsmodels.tsa.holtwinters import SimpleExpSmoothing
+
+        self.season = self.params.get("season", 1)
+        current_time = time.time()
+        super().fit(X_train, y_train, budget=budget, **kwargs)
+
+        train_df, target_col = self.joint_preprocess(X_train, y_train)
+
+        model = SimpleExpSmoothing(
+            train_df[[target_col]],
+        )
+        with suppress_stdout_stderr():
+            model = model.fit(smoothing_level=self.smoothing_level)
+        train_time = time.time() - current_time
+        self._model = model
+        return train_time
+
+
+class SeasonalNaive(SimpleForecaster):
+    smoothing_level = 1.0
+
+    def predict(self, X, **kwargs):
+        if isinstance(X, int):
+            forecasts = []
+            for i in range(X):
+                forecast = self._model.forecast(steps=self.season)[0]
+                forecasts.append(forecast)
+            return
pd.Series(forecasts) + else: + return super().predict(X, **kwargs) + + +class Naive(SimpleForecaster): + smoothing_level = 0.0 + + @classmethod + def _search_space(cls, data: TimeSeriesDataset, task: Task, pred_horizon: int, **params): + return {} + + def predict(self, X, **kwargs): + if isinstance(X, int): + last_observation = self._model.params["initial_level"] + return pd.Series([last_observation] * X) + else: + return super().predict(X, **kwargs) + + +class SeasonalAverage(SimpleForecaster): + def fit(self, X_train, y_train=None, budget=None, **kwargs): + from statsmodels.tsa.ar_model import AutoReg, ar_select_order + + start_time = time.time() + + self.season = kwargs.get("season", 1) # seasonality period + train_df, target_col = self.joint_preprocess(X_train, y_train) + selection_res = ar_select_order(train_df[target_col], maxlag=self.season) + + # Fit autoregressive model with optimal order + model = AutoReg(train_df[target_col], lags=selection_res.ar_lags) + self._model = model.fit() + end_time = time.time() + + return end_time - start_time + + +class Average(SimpleForecaster): + @classmethod + def _search_space(cls, data: TimeSeriesDataset, task: Task, pred_horizon: int, **params): + return {} + + def fit(self, X_train, y_train=None, budget=None, **kwargs): + from statsmodels.tsa.ar_model import AutoReg + + start_time = time.time() + train_df, target_col = self.joint_preprocess(X_train, y_train) + model = AutoReg(train_df[target_col], lags=0) + self._model = model.fit() + end_time = time.time() + + return end_time - start_time + + class TS_SKLearn(TimeSeriesEstimator): """The class for tuning SKLearn Regressors for time-series forecasting""" @@ -757,3 +877,7 @@ class XGBoostLimitDepth_TS(TS_SKLearn): # catboost regressor is invalid because it has a `name` parameter, making it incompatible with hcrystalball class CatBoost_TS(TS_SKLearn): base_class = CatBoostEstimator + + +class LassoLars_TS(TS_SKLearn): + base_class = LassoLarsEstimator diff --git a/flaml/fabric/__init__.py b/flaml/fabric/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/flaml/fabric/mlflow.py b/flaml/fabric/mlflow.py new file mode 100644 index 0000000000..5eebefa961 --- /dev/null +++ b/flaml/fabric/mlflow.py @@ -0,0 +1,689 @@ +import json +import os +import pickle +import random +import sys +import time +from typing import MutableMapping + +import mlflow +import pandas as pd +from mlflow.entities import Metric, Param, RunTag +from mlflow.exceptions import MlflowException +from mlflow.utils.autologging_utils import AUTOLOGGING_INTEGRATIONS, autologging_is_disabled +from scipy.sparse import issparse +from sklearn import tree + +try: + from pyspark.ml import Pipeline as SparkPipeline +except ImportError: + + class SparkPipeline: + pass + + +# from mlflow.store.tracking import SEARCH_MAX_RESULTS_THRESHOLD +from sklearn.pipeline import Pipeline + +from flaml.automl.logger import logger +from flaml.automl.spark import DataFrame, Series, psDataFrame, psSeries +from flaml.version import __version__ + +SEARCH_MAX_RESULTS = 5000 # Each train should not have more than 5000 trials +IS_RENAME_CHILD_RUN = os.environ.get("FLAML_IS_RENAME_CHILD_RUN", "false").lower() == "true" + + +def flatten_dict(d: MutableMapping, sep: str = ".") -> MutableMapping: + if len(d) == 0: + return d + [flat_dict] = pd.json_normalize(d, sep=sep).to_dict(orient="records") + keys = list(flat_dict.keys()) + for key in keys: + if not isinstance(flat_dict[key], (int, float)): + flat_dict.pop(key) + return flat_dict + + +def 
is_autolog_enabled(): + return not all(autologging_is_disabled(k) for k in AUTOLOGGING_INTEGRATIONS.keys()) + + +def get_mlflow_log_latency(model_history=False): + st = time.time() + with mlflow.start_run(nested=True, run_name="get_mlflow_log_latency") as run: + if model_history: + sk_model = tree.DecisionTreeClassifier() + mlflow.sklearn.log_model(sk_model, "sk_models") + mlflow.sklearn.log_model(Pipeline([("estimator", sk_model)]), "sk_pipeline") + pickle_fpath = f"tmp_{int(time.time()*1000)}" + with open(pickle_fpath, "wb") as f: + pickle.dump(sk_model, f) + mlflow.log_artifact(pickle_fpath, "sk_model1") + mlflow.log_artifact(pickle_fpath, "sk_model2") + os.remove(pickle_fpath) + mlflow.set_tag("synapseml.ui.visible", "false") # not shown inline in fabric + mlflow.delete_run(run.info.run_id) + et = time.time() + return et - st + + +def infer_signature(X_train=None, y_train=None, dataframe=None, label=None): + if X_train is not None: + if issparse(X_train): + X_train = X_train.tocsr() + elif isinstance(X_train, psDataFrame): + X_train = X_train.to_spark(index_col="tmp_index_col") + y_train = None + try: + signature = mlflow.models.infer_signature(X_train, y_train) + return signature + except (TypeError, MlflowException, Exception) as e: + logger.debug( + f"Failed to infer signature from X_train {type(X_train)} and y_train {type(y_train)}, error: {e}" + ) + else: + if dataframe is not None and label is not None: + X = dataframe.drop(columns=label) + y = dataframe[label] + if isinstance(dataframe, psDataFrame): + X = X.to_spark(index_col="tmp_index_col") + y = None + try: + signature = mlflow.models.infer_signature(X, y) + return signature + except (TypeError, MlflowException, Exception) as e: + logger.debug( + f"Failed to infer signature from dataframe {type(dataframe)} and label {label}, error: {e}" + ) + + +def _mlflow_wrapper(evaluation_func, mlflow_exp_id, mlflow_config=None, extra_tags=None, autolog=False): + def wrapped(*args, **kwargs): + if mlflow_config is not None: + from synapse.ml.mlflow import set_mlflow_env_config + + set_mlflow_env_config(mlflow_config) + import mlflow + + if mlflow_exp_id is not None: + mlflow.set_experiment(experiment_id=mlflow_exp_id) + if autolog: + if mlflow.__version__ > "2.5.0" and extra_tags is not None: + mlflow.autolog(silent=True, extra_tags=extra_tags) + else: + mlflow.autolog(silent=True) + logger.debug("activated mlflow autologging on executor") + else: + mlflow.autolog(disable=True, silent=True) + # with mlflow.start_run(nested=True): + result = evaluation_func(*args, **kwargs) + return result + + return wrapped + + +def _get_notebook_name(): + return None + + +class MLflowIntegration: + def __init__(self, experiment_type="automl", mlflow_exp_name=None, extra_tag=None): + try: + from synapse.ml.mlflow import get_mlflow_env_config + + self.driver_mlflow_env_config = get_mlflow_env_config() + self._on_internal = True + self._notebook_name = _get_notebook_name() + except ModuleNotFoundError: + self.driver_mlflow_env_config = None + self._on_internal = False + self._notebook_name = None + + self.autolog = False + self.manual_log = False + self.parent_run_id = None + self.parent_run_name = None + self.log_type = "null" + self.resume_params = {} + self.train_func = None + self.best_iteration = None + self.best_run_id = None + self.child_counter = 0 + self.infos = [] + self.manual_run_ids = [] + self.has_summary = False + self.has_model = False + self.only_history = False + self._do_log_model = True + + self.extra_tag = ( + extra_tag + if extra_tag 
is not None + else {"extra_tag.sid": f"flaml_{__version__}_{int(time.time())}_{random.randint(1001, 9999)}"} + ) + self.start_time = time.time() + self.mlflow_client = mlflow.tracking.MlflowClient() + parent_run_info = mlflow.active_run().info if mlflow.active_run() is not None else None + if parent_run_info: + self.experiment_id = parent_run_info.experiment_id + self.parent_run_id = parent_run_info.run_id + # attribute run_name is not available before mlflow 2.0.1 + self.parent_run_name = parent_run_info.run_name if hasattr(parent_run_info, "run_name") else "flaml_run" + if self.parent_run_name == "": + self.parent_run_name = mlflow.active_run().data.tags["mlflow.runName"] + else: + if mlflow_exp_name is None: + if mlflow.tracking.fluent._active_experiment_id is None: + mlflow_exp_name = self._notebook_name if self._notebook_name else "flaml_default_experiment" + mlflow.set_experiment(experiment_name=mlflow_exp_name) + else: + mlflow.set_experiment(experiment_name=mlflow_exp_name) + self.experiment_id = mlflow.tracking.fluent._active_experiment_id + self.experiment_name = mlflow.get_experiment(self.experiment_id).name + self.experiment_type = experiment_type + self.update_autolog_state() + + if self.autolog: + # only end user created parent run in autolog scenario + mlflow.end_run() + + def set_mlflow_config(self): + if self.driver_mlflow_env_config is not None: + from synapse.ml.mlflow import set_mlflow_env_config + + set_mlflow_env_config(self.driver_mlflow_env_config) + + def wrap_evaluation_function(self, evaluation_function): + wrapped_evaluation_function = _mlflow_wrapper( + evaluation_function, self.experiment_id, self.driver_mlflow_env_config, self.extra_tag, self.autolog + ) + return wrapped_evaluation_function + + def set_best_iter(self, result): + # result: AutoML or ExperimentAnalysis + try: + self.best_iteration = result.best_iteration + except AttributeError: + self.best_iteration = None + + def update_autolog_state( + self, + ): + # Currently we disable autologging for better control in AutoML + _autolog = is_autolog_enabled() + self._do_log_model = AUTOLOGGING_INTEGRATIONS["mlflow"].get("log_models", True) + if self.experiment_type == "automl": + self.autolog = False + self.manual_log = mlflow.active_run() is not None or _autolog + self.log_type = "manual" + if _autolog: + logger.debug("Disabling autologging") + self.resume_params = AUTOLOGGING_INTEGRATIONS["mlflow"].copy() + mlflow.autolog(disable=True, silent=True, log_models=self._do_log_model) + self.log_type = "r_autolog" # 'r' for replace autolog with manual log + + elif self.experiment_type == "tune": + self.autolog = _autolog + self.manual_log = not self.autolog and mlflow.active_run() is not None + + if self.autolog: + self.log_type = "autolog" + + if self.manual_log: + self.log_type = "manual" + else: + raise ValueError(f"Unknown experiment type: {self.experiment_type}") + + def copy_mlflow_run(self, src_id, target_id, components=["param", "metric", "tag"]): + src_run = self.mlflow_client.get_run(src_id) + if "param" in components: + for param_name, param_value in src_run.data.params.items(): + try: + self.mlflow_client.log_param(target_id, param_name, param_value) + except mlflow.exceptions.MlflowException: + pass + + timestamp = int(time.time() * 1000) + + if "metric" in components: + _metrics = [Metric(key, value, timestamp, 0) for key, value in src_run.data.metrics.items()] + else: + _metrics = [] + + if "tag" in components: + _tags = [ + RunTag(key, str(value)) + for key, value in src_run.data.tags.items() + 
if key.startswith("flaml") or key.startswith("synapseml") + ] + else: + _tags = [] + self.mlflow_client.log_batch(run_id=target_id, metrics=_metrics, params=[], tags=_tags) + + def record_trial(self, result, trial, metric): + if isinstance(result, dict): + metrics = flatten_dict(result) + metric_name = str(list(metrics.keys())) + else: + metrics = {metric: result} + metric_name = metric + + if "ml" in trial.config.keys(): + params = trial.config["ml"] + else: + params = trial.config + + info = { + "metrics": metrics, + "params": params, + "tags": { + "flaml.best_run": False, + "flaml.iteration_number": self.child_counter, + "flaml.version": __version__, + "flaml.meric": metric_name, + "flaml.run_source": "flaml-tune", + "flaml.log_type": self.log_type, + }, + "submetrics": { + "values": [], + }, + } + + self.infos.append(info) + + if not self.autolog and not self.manual_log: + return + + if self.manual_log: + with mlflow.start_run( + nested=True, run_name=f"{self.parent_run_name}_child_{self.child_counter}" + ) as child_run: + self._log_info_to_run(info, child_run.info.run_id, log_params=True) + self.manual_run_ids.append(child_run.info.run_id) + self.child_counter += 1 + + def log_tune(self, analysis, metric): + self.set_best_iter(analysis) + if self.autolog: + if self.parent_run_id is not None: + mlflow.start_run(run_id=self.parent_run_id, experiment_id=self.experiment_id) + mlflow.log_metric("num_child_runs", len(self.infos)) + self.adopt_children(analysis) + + if self.manual_log: + if "ml" in analysis.best_config.keys(): + mlflow.log_params(analysis.best_config["ml"]) + else: + mlflow.log_params(analysis.best_config) + mlflow.log_metric("best_" + metric, analysis.best_result[metric]) + best_mlflow_run_id = self.manual_run_ids[analysis.best_iteration] + best_mlflow_run_name = self.mlflow_client.get_run(best_mlflow_run_id).info.run_name + analysis.best_run_id = best_mlflow_run_id + analysis.best_run_name = best_mlflow_run_name + self.mlflow_client.set_tag(best_mlflow_run_id, "flaml.best_run", True) + self.best_run_id = best_mlflow_run_id + if not self.has_summary: + self.copy_mlflow_run(best_mlflow_run_id, self.parent_run_id) + self.has_summary = True + + def log_model(self, model, estimator, signature=None): + if not self._do_log_model: + return + logger.debug(f"logging model {estimator}") + if estimator.endswith("_spark"): + mlflow.spark.log_model(model, estimator, signature=signature) + mlflow.spark.log_model(model, "model", signature=signature) + elif estimator in ["lgbm"]: + mlflow.lightgbm.log_model(model, estimator, signature=signature) + elif estimator in ["transformer", "transformer_ms"]: + mlflow.transformers.log_model(model, estimator, signature=signature) + elif estimator in ["arima", "sarimax", "holt-winters", "snaive", "naive", "savg", "avg", "ets"]: + mlflow.statsmodels.log_model(model, estimator, signature=signature) + elif estimator in ["tcn", "tft"]: + mlflow.pytorch.log_model(model, estimator, signature=signature) + elif estimator in ["prophet"]: + mlflow.prophet.log_model(model, estimator, signature=signature) + elif estimator in ["orbit"]: + pass + else: + mlflow.sklearn.log_model(model, estimator, signature=signature) + + def _pickle_and_log_artifact(self, obj, artifact_name, pickle_fpath="temp_.pkl"): + if not self._do_log_model: + return + with open(pickle_fpath, "wb") as f: + pickle.dump(obj, f) + mlflow.log_artifact(pickle_fpath, artifact_name) + + def pickle_and_log_automl_artifacts(self, automl, model, estimator, signature=None): + """log automl artifacts to 
mlflow + load back with `automl = mlflow.pyfunc.load_model(model_run_id_or_uri)`, then do prediction with `automl.predict(X)` + """ + logger.debug(f"logging automl artifacts {estimator}") + self._pickle_and_log_artifact(automl.feature_transformer, "feature_transformer", "feature_transformer.pkl") + self._pickle_and_log_artifact(automl.label_transformer, "label_transformer", "label_transformer.pkl") + # Test test_mlflow 1 and 4 will get error: TypeError: cannot pickle '_io.TextIOWrapper' object + # try: + # self._pickle_and_log_artifact(automl, "automl", "automl.pkl") + # except TypeError: + # pass + if estimator.endswith("_spark"): + # spark pipeline is not supported yet + return + feature_transformer = automl.feature_transformer + if isinstance(feature_transformer, Pipeline): + pipeline = feature_transformer + pipeline.steps.append(("estimator", model)) + elif isinstance(feature_transformer, SparkPipeline): + pipeline = feature_transformer + pipeline.stages.append(model) + elif not estimator.endswith("_spark"): + steps = [("feature_transformer", feature_transformer)] + steps.append(("estimator", model)) + pipeline = Pipeline(steps) + else: + stages = [feature_transformer] + stages.append(model) + pipeline = SparkPipeline(stages=stages) + if isinstance(pipeline, SparkPipeline): + logger.debug(f"logging spark pipeline {estimator}") + mlflow.spark.log_model(pipeline, "automl_pipeline", signature=signature) + else: + # Add a log named "model" to fit default settings + logger.debug(f"logging sklearn pipeline {estimator}") + mlflow.sklearn.log_model(pipeline, "automl_pipeline", signature=signature) + mlflow.sklearn.log_model(pipeline, "model", signature=signature) + + def record_state(self, automl, search_state, estimator): + _st = time.time() + automl_metric_name = ( + automl._state.metric if isinstance(automl._state.metric, str) else automl._state.error_metric + ) + + if automl._state.error_metric.startswith("1-"): + automl_metric_value = 1 - search_state.val_loss + elif automl._state.error_metric.startswith("-"): + automl_metric_value = -search_state.val_loss + else: + automl_metric_value = search_state.val_loss + + if "ml" in search_state.config: + config = search_state.config["ml"] + else: + config = search_state.config + + info = { + "metrics": { + "iter_counter": automl._track_iter, + "trial_time": search_state.trial_time, + "wall_clock_time": automl._state.time_from_start, + "validation_loss": search_state.val_loss, + "best_validation_loss": search_state.best_loss, + automl_metric_name: automl_metric_value, + }, + "tags": { + "flaml.best_run": False, + "flaml.estimator_name": estimator, + "flaml.estimator_class": search_state.learner_class.__name__, + "flaml.iteration_number": automl._track_iter, + "flaml.version": __version__, + "flaml.learner": estimator, + "flaml.sample_size": search_state.sample_size, + "flaml.meric": automl_metric_name, + "flaml.run_source": "flaml-automl", + "flaml.log_type": self.log_type, + "flaml.automl_user_configurations": json.dumps(automl._automl_user_configurations), + }, + "params": { + "sample_size": search_state.sample_size, + "learner": estimator, + **config, + }, + "submetrics": { + "iter_counter": automl._iter_per_learner[estimator], + "values": [], + }, + } + + if (search_state.metric_for_logging is not None) and ( + "intermediate_results" in search_state.metric_for_logging + ): + info["submetrics"]["values"] = search_state.metric_for_logging["intermediate_results"] + + self.infos.append(info) + + if not self.autolog and not self.manual_log: + 
return + if self.manual_log: + if self.parent_run_name is not None: + run_name = f"{self.parent_run_name}_child_{self.child_counter}" + else: + run_name = None + with mlflow.start_run(nested=True, run_name=run_name) as child_run: + self._log_info_to_run(info, child_run.info.run_id, log_params=True) + if automl._state.model_history: + self.log_model( + search_state.trained_estimator._model, estimator, signature=automl.estimator_signature + ) + self.pickle_and_log_automl_artifacts( + automl, search_state.trained_estimator, estimator, signature=automl.pipeline_signature + ) + self.manual_run_ids.append(child_run.info.run_id) + self.child_counter += 1 + + def log_automl(self, automl): + self.set_best_iter(automl) + if self.autolog: + if self.parent_run_id is not None: + mlflow.start_run(run_id=self.parent_run_id, experiment_id=self.experiment_id) + mlflow.log_metric("best_validation_loss", automl._state.best_loss) + mlflow.log_metric("best_iteration", automl._best_iteration) + mlflow.log_metric("num_child_runs", len(self.infos)) + if automl._trained_estimator is not None and not self.has_model: + self.log_model( + automl._trained_estimator._model, automl.best_estimator, signature=automl.estimator_signature + ) + self.pickle_and_log_automl_artifacts( + automl, automl.model, automl.best_estimator, signature=automl.pipeline_signature + ) + self.has_model = True + + self.adopt_children(automl) + + if self.manual_log: + best_mlflow_run_id = self.manual_run_ids[automl._best_iteration] + best_run_name = self.mlflow_client.get_run(best_mlflow_run_id).info.run_name + automl.best_run_id = best_mlflow_run_id + automl.best_run_name = best_run_name + self.mlflow_client.set_tag(best_mlflow_run_id, "flaml.best_run", True) + self.best_run_id = best_mlflow_run_id + if self.parent_run_id is not None: + conf = automl._config_history[automl._best_iteration][1].copy() + if "ml" in conf.keys(): + conf = conf["ml"] + + mlflow.log_params(conf) + mlflow.log_param("best_learner", automl._best_estimator) + if not self.has_summary: + logger.info(f"logging best model {automl.best_estimator}") + self.copy_mlflow_run(best_mlflow_run_id, self.parent_run_id) + self.has_summary = True + if automl._trained_estimator is not None and not self.has_model: + self.log_model( + automl._trained_estimator._model, + automl.best_estimator, + signature=automl.estimator_signature, + ) + self.pickle_and_log_automl_artifacts( + automl, automl.model, automl.best_estimator, signature=automl.pipeline_signature + ) + self.has_model = True + + def resume_mlflow(self): + if len(self.resume_params) > 0: + mlflow.autolog(**self.resume_params) + + def _log_info_to_run(self, info, run_id, log_params=False): + _metrics = [Metric(key, value, int(time.time() * 1000), 0) for key, value in info["metrics"].items()] + _tags = [RunTag(key, str(value)) for key, value in info["tags"].items()] + _params = [ + Param(key, str(value)) + for key, value in info["params"].items() + if log_params or key in ["sample_size", "learner"] + ] + self.mlflow_client.log_batch(run_id=run_id, metrics=_metrics, params=_params, tags=_tags) + + if len(info["submetrics"]["values"]) > 0: + for each_entry in info["submetrics"]["values"]: + with mlflow.start_run(nested=True) as run: + each_entry.update({"iter_counter": info["submetrics"]["iter_counter"]}) + _metrics = [Metric(key, value, int(time.time() * 1000), 0) for key, value in each_entry.items()] + _tags = [RunTag("mlflow.parentRunId", run_id)] + self.mlflow_client.log_batch(run_id=run.info.run_id, metrics=_metrics, params=[], 
tags=_tags) + del info["submetrics"]["values"] + + def adopt_children(self, result=None): + """ + Set autologging child runs to nested by fetching them after all child runs are completed. + Note that this may cause disorder when concurrently starting multiple AutoML processes + with the same experiment name if the MLflow version is less than or equal to "2.5.0". + """ + if self.autolog: + best_iteration = self.best_iteration + if best_iteration is None: + logger.warning("best_iteration is None, cannot identify best run") + raw_autolog_child_runs = mlflow.search_runs( + experiment_ids=[self.experiment_id], + order_by=["attributes.start_time DESC"], + max_results=SEARCH_MAX_RESULTS, + output_format="list", + filter_string=( + f"tags.extra_tag.sid = '{self.extra_tag['extra_tag.sid']}'" if mlflow.__version__ > "2.5.0" else "" + ), + ) + self.child_counter = 0 + + # From latest to earliest, remove duplicate cross-validation runs + _exist_child_run_params = [] # for deduplication of cross-validation child runs + _to_keep_autolog_child_runs = [] + for autolog_child_run in raw_autolog_child_runs: + child_start_time = autolog_child_run.info.start_time / 1000 + + if child_start_time < self.start_time: + continue + + _current_child_run_params = autolog_child_run.data.params + # remove n_estimators as some models will train with small n_estimators to estimate time budget + if self.experiment_type == "automl": + _current_child_run_params.pop("n_estimators", None) + if _current_child_run_params in _exist_child_run_params: + # remove duplicate cross-validation run + self.mlflow_client.delete_run(autolog_child_run.info.run_id) + continue + else: + _exist_child_run_params.append(_current_child_run_params) + _to_keep_autolog_child_runs.append(autolog_child_run) + + # From earliest to latest, set tags and child_counter + autolog_child_runs = _to_keep_autolog_child_runs[::-1] + for autolog_child_run in autolog_child_runs: + child_run_id = autolog_child_run.info.run_id + child_run_parent_id = autolog_child_run.data.tags.get("mlflow.parentRunId", None) + child_start_time = autolog_child_run.info.start_time / 1000 + + if child_start_time < self.start_time: + continue + + if all( + [ + len(autolog_child_run.data.params) == 0, + len(autolog_child_run.data.metrics) == 0, + child_run_id != self.parent_run_id, + ] + ): + # remove empty run + # empty run could be created by mlflow autologging + self.mlflow_client.delete_run(autolog_child_run.info.run_id) + continue + + if all( + [ + child_run_id != self.parent_run_id, + child_run_parent_id is None or child_run_parent_id == self.parent_run_id, + ] + ): + if self.parent_run_id is not None: + self.mlflow_client.set_tag( + child_run_id, + "mlflow.parentRunId", + self.parent_run_id, + ) + if IS_RENAME_CHILD_RUN: + self.mlflow_client.set_tag( + child_run_id, + "mlflow.runName", + f"{self.parent_run_name}_child_{self.child_counter}", + ) + self.mlflow_client.set_tag(child_run_id, "flaml.child_counter", self.child_counter) + + # merge autolog child run and corresponding manual run + flaml_info = self.infos[self.child_counter] + child_run = self.mlflow_client.get_run(child_run_id) + self._log_info_to_run(flaml_info, child_run_id, log_params=False) + + if self.experiment_type == "automl": + if "learner" not in child_run.data.params: + self.mlflow_client.log_param(child_run_id, "learner", flaml_info["params"]["learner"]) + if "sample_size" not in child_run.data.params: + self.mlflow_client.log_param( + child_run_id, "sample_size", flaml_info["params"]["sample_size"] + ) + + if 
self.child_counter == best_iteration:
+                    self.mlflow_client.set_tag(child_run_id, "flaml.best_run", True)
+                    if result is not None:
+                        result.best_run_id = child_run_id
+                        result.best_run_name = child_run.info.run_name
+                        self.best_run_id = child_run_id
+                    if self.parent_run_id is not None and not self.has_summary:
+                        self.copy_mlflow_run(child_run_id, self.parent_run_id)
+                        self.has_summary = True
+                self.child_counter += 1
+
+    def retrain(self, train_func, config):
+        """Retrain with the given config; added for logging the best config and model to the parent run.
+        No longer needed after v2.0.2post2, as we no longer log the best config and model to the parent run.
+        """
+        if self.autolog:
+            self.set_mlflow_config()
+            self.has_summary = True
+            with mlflow.start_run(run_id=self.parent_run_id):
+                train_func(config)
+
+    def __del__(self):
+        # mlflow.end_run()  # this will end the parent run when re-fitting an AutoML instance. Bug 2922020: Inconsistent Run Creation Output
+        self.resume_mlflow()
+
+
+def register_automl_pipeline(automl, model_name=None, signature=None):
+    pipeline = automl.automl_pipeline
+    if pipeline is None:
+        logger.warning("pipeline not found, cannot register it")
+        return
+    if model_name is None:
+        model_name = automl._mlflow_exp_name + "_pipeline"
+    if automl.best_run_id is None:
+        mlflow.sklearn.log_model(
+            pipeline,
+            "automl_pipeline",
+            registered_model_name=model_name,
+            signature=automl.pipeline_signature if signature is None else signature,
+        )
+        mvs = mlflow.search_model_versions(
+            filter_string=f"name='{model_name}'", order_by=["attribute.version_number ASC"], max_results=1
+        )
+        return mvs[0]
+    else:
+        best_run = mlflow.get_run(automl.best_run_id)
+        model_uri = f"runs:/{best_run.info.run_id}/automl_pipeline"
+        return mlflow.register_model(model_uri, model_name)
diff --git a/flaml/tune/tune.py b/flaml/tune/tune.py
index ce249621d5..0bae6b9109 100644
--- a/flaml/tune/tune.py
+++ b/flaml/tune/tune.py
@@ -29,6 +29,18 @@
 from .result import DEFAULT_METRIC
 from .trial import Trial
 
+try:
+    import mlflow
+except ImportError:
+    mlflow = None
+try:
+    from flaml.fabric.mlflow import MLflowIntegration, is_autolog_enabled
+
+    internal_mlflow = True
+except ImportError:
+    internal_mlflow = False
+
+
 logger = logging.getLogger(__name__)
 logger.propagate = False
 _use_ray = True
@@ -44,6 +56,7 @@ class ExperimentAnalysis(EA):
     """Class for storing the experiment results."""
 
     def __init__(self, trials, metric, mode, lexico_objectives=None):
+        self.best_run_id = None
         try:
             super().__init__(self, None, trials, metric, mode)
             self.lexico_objectives = lexico_objectives
@@ -128,6 +141,16 @@ def best_result(self) -> Dict:
         else:
             return self.best_trial.last_result
 
+    @property
+    def best_iteration(self) -> Optional[int]:
+        """The zero-based index of the best trial among all trials."""
+        best_trial = self.best_trial
+        best_trial_id = best_trial.trial_id
+        for i, trial in enumerate(self.trials):
+            if trial.trial_id == best_trial_id:
+                return i
+        return None
+
 def report(_metric=None, **kwargs):
     """A function called by the HPO application to report final or intermediate
@@ -234,6 +257,9 @@ def run(
     lexico_objectives: Optional[dict] = None,
     force_cancel: Optional[bool] = False,
     n_concurrent_trials: Optional[int] = 0,
+    mlflow_exp_name: Optional[str] = None,
+    automl_info: Optional[Tuple[float]] = None,
+    extra_tag: Optional[dict] = None,
     **ray_args,
 ):
     """The function-based way of performing HPO.
@@ -424,6 +450,10 @@ def easy_objective(config):
         }
         ```
         force_cancel: boolean, default=False | Whether to forcely cancel the PySpark job if overtime.
+ mlflow_exp_name: str, default=None | The name of the mlflow experiment. This should be specified if + enable mlflow autologging on Spark. Otherwise it will log all the results into the experiment of the + same name as the basename of main entry file. + automl_info: tuple, default=None | The information of the automl run. It should be a tuple of (mlflow_log_latency,). n_concurrent_trials: int, default=0 | The number of concurrent trials when perform hyperparameter tuning with Spark. Only valid when use_spark=True and spark is required: `pip install flaml[spark]`. Please check @@ -431,6 +461,7 @@ def easy_objective(config): for more details about installing Spark. When tune.run() is called from AutoML, it will be overwritten by the value of `n_concurrent_trials` in AutoML. When <= 0, the concurrent trials will be set to the number of executors. + extra_tag: dict, default=None | Extra tags to be added to the mlflow runs created by autologging. **ray_args: keyword arguments to pass to ray.tune.run(). Only valid when use_ray=True. """ @@ -438,10 +469,12 @@ def easy_objective(config): global _verbose global _running_trial global _training_iteration + global internal_mlflow old_use_ray = _use_ray old_verbose = _verbose old_running_trial = _running_trial old_training_iteration = _training_iteration + if log_file_name: dir_name = os.path.dirname(log_file_name) if dir_name: @@ -486,6 +519,13 @@ def easy_objective(config): else: logger.setLevel(logging.CRITICAL) + if internal_mlflow and not automl_info and (mlflow.active_run() or is_autolog_enabled()): + mlflow_integration = MLflowIntegration("tune", mlflow_exp_name, extra_tag) + evaluation_function = mlflow_integration.wrap_evaluation_function(evaluation_function) + _internal_mlflow = not automl_info # True if mlflow_integration will be used for logging + else: + _internal_mlflow = False + from .searcher.blendsearch import CFO, BlendSearch, RandomSearch if lexico_objectives is not None: @@ -713,11 +753,15 @@ def easy_objective(config): time_budget_s = np.inf num_failures = 0 upperbound_num_failures = (len(evaluated_rewards) if evaluated_rewards else 0) + max_failure + logger.debug(f"automl_info: {automl_info}") while ( time.time() - time_start < time_budget_s and (num_samples < 0 or num_trials < num_samples) and num_failures < upperbound_num_failures ): + if automl_info and automl_info[0] > 0 and time_budget_s < np.inf: + time_budget_s -= automl_info[0] + logger.debug(f"Remaining time budget with mlflow log latency: {time_budget_s} seconds.") while len(_runner.running_trials) < n_concurrent_trials: # suggest trials for spark trial_next = _runner.step() @@ -750,6 +794,9 @@ def easy_objective(config): trial_to_run = trials_to_run[0] _runner.running_trial = trial_to_run if result is not None: + if _internal_mlflow: + mlflow_integration.record_trial(result, trial_to_run, metric) + if isinstance(result, dict): if result: logger.info(f"Brief result: {result}") @@ -768,6 +815,20 @@ def easy_objective(config): mode=mode, lexico_objectives=lexico_objectives, ) + analysis.search_space = config + + if _internal_mlflow: + mlflow_integration.log_tune(analysis, metric) + # try: + # _best_config = analysis.best_config + # except Exception: + # _best_config = None + # if _best_config: + # parallel( + # delayed(mlflow_integration.retrain)(evaluation_function, analysis.best_config) + # for dummy in [0] + # ) + return analysis finally: # recover the global variables in case of nested run @@ -779,6 +840,8 @@ def easy_objective(config): _runner = old_runner 
logger.handlers = old_handlers logger.setLevel(old_level) + if _internal_mlflow: + mlflow_integration.adopt_children() # simple sequential run without using tune.run() from ray time_start = time.time() @@ -812,7 +875,11 @@ def easy_objective(config): result = None with PySparkOvertimeMonitor(time_start, time_budget_s, force_cancel): result = evaluation_function(trial_to_run.config) + logger.debug(f"result in tune: {trial_to_run}, {result}") if result is not None: + if _internal_mlflow: + mlflow_integration.record_trial(result, trial_to_run, metric) + if isinstance(result, dict): if result: report(**result) @@ -838,6 +905,19 @@ def easy_objective(config): mode=mode, lexico_objectives=lexico_objectives, ) + analysis.search_space = config + if _internal_mlflow: + mlflow_integration.log_tune(analysis, metric) + if analysis.best_run_id is not None: + logger.info(f"Best MLflow run name: {analysis.best_run_name}") + logger.info(f"Best MLflow run id: {analysis.best_run_id}") + # try: + # _best_config = analysis.best_config + # except Exception: + # _best_config = None + # if _best_config: + # mlflow_integration.retrain(evaluation_function, analysis.best_config) + return analysis finally: # recover the global variables in case of nested run @@ -849,6 +929,8 @@ def easy_objective(config): _runner = old_runner logger.handlers = old_handlers logger.setLevel(old_level) + if _internal_mlflow: + mlflow_integration.adopt_children() class Tuner: diff --git a/setup.py b/setup.py index 8592d7fee5..5783e99de0 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,8 @@ "lightgbm>=2.3.1", "xgboost>=0.90,<2.0.0", "scipy>=1.4.1", - "pandas>=1.1.4", + "pandas>=1.1.4,<2.0.0; python_version<'3.10'", + "pandas>=1.1.4; python_version>='3.10'", "scikit-learn>=1.0.0", "thop", "pytest>=6.1.1", @@ -78,8 +79,8 @@ "hcrystalball==0.1.10", "seqeval", "pytorch-forecasting>=0.9.0,<=0.10.1; python_version<'3.11'", - "mlflow", - "pyspark>=3.2.0", + # "pytorch-forecasting==0.10.1; python_version=='3.11'", + "mlflow==2.15.1", "joblibspark>=0.5.0", "joblib<=1.3.2", "nbconvert", @@ -92,6 +93,7 @@ "pydantic==1.10.9", "sympy", "wolframalpha", + "dill", # a drop in replacement of pickle ], "catboost": [ "catboost>=0.26,<1.2; python_version<'3.11'", @@ -139,7 +141,8 @@ "prophet>=1.0.1", "statsmodels>=0.12.2", "hcrystalball==0.1.10", - "pytorch-forecasting>=0.9.0", + "pytorch-forecasting>=0.9.0; python_version<'3.11'", + # "pytorch-forecasting==0.10.1; python_version=='3.11'", "pytorch-lightning==1.9.0", "tensorboardX==2.6", ], diff --git a/test/automl/test_extra_models.py b/test/automl/test_extra_models.py new file mode 100644 index 0000000000..6c5cac0992 --- /dev/null +++ b/test/automl/test_extra_models.py @@ -0,0 +1,310 @@ +import os +import sys +import unittest +import warnings +from collections import defaultdict + +import mlflow +import numpy as np +import pandas as pd +import pytest +import scipy +from packaging.version import Version +from sklearn.datasets import load_breast_cancer, load_diabetes, load_iris +from sklearn.model_selection import train_test_split + +from flaml import AutoML +from flaml.automl.ml import sklearn_metric_loss_score +from flaml.tune.spark.utils import check_spark + +leaderboard = defaultdict(dict) + +warnings.simplefilter(action="ignore") +if sys.platform == "darwin" or "nt" in os.name: + # skip this test if the platform is not linux + skip_spark = True +else: + try: + import pyspark + from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator + from pyspark.ml.feature import 
VectorAssembler + + from flaml.automl.spark.utils import to_pandas_on_spark + + spark = ( + pyspark.sql.SparkSession.builder.appName("MyApp") + .master("local[2]") + .config( + "spark.jars.packages", + ( + "com.microsoft.azure:synapseml_2.12:1.0.2," + "org.apache.hadoop:hadoop-azure:3.3.5," + "com.microsoft.azure:azure-storage:8.6.6," + f"org.mlflow:mlflow-spark_2.12:{mlflow.__version__}" + if Version(mlflow.__version__) >= Version("2.9.0") + else f"org.mlflow:mlflow-spark:{mlflow.__version__}" + ), + ) + .config("spark.jars.repositories", "https://mmlspark.azureedge.net/maven") + .config("spark.sql.debug.maxToStringFields", "100") + .config("spark.driver.extraJavaOptions", "-Xss1m") + .config("spark.executor.extraJavaOptions", "-Xss1m") + .getOrCreate() + ) + spark.sparkContext._conf.set( + "spark.mlflow.pysparkml.autolog.logModelAllowlistFile", + "https://mmlspark.blob.core.windows.net/publicwasb/log_model_allowlist.txt", + ) + # spark.sparkContext.setLogLevel("ERROR") + spark_available, _ = check_spark() + skip_spark = not spark_available + except ImportError: + skip_spark = True + + +def _test_regular_models(estimator_list, task): + if isinstance(estimator_list, str): + estimator_list = [estimator_list] + if task == "classification": + load_dataset_func = load_iris + metric = "accuracy" + else: + load_dataset_func = load_diabetes + metric = "r2" + + x, y = load_dataset_func(return_X_y=True, as_frame=True) + x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=7654321) + + automl_experiment = AutoML() + automl_settings = { + "max_iter": 5, + "task": task, + "estimator_list": estimator_list, + "metric": metric, + } + automl_experiment.fit(X_train=x_train, y_train=y_train, **automl_settings) + predictions = automl_experiment.predict(x_test) + score = sklearn_metric_loss_score(metric, predictions, y_test) + for estimator_name in estimator_list: + leaderboard[task][estimator_name] = score + + +def _test_spark_models(estimator_list, task): + if isinstance(estimator_list, str): + estimator_list = [estimator_list] + if task == "classification": + load_dataset_func = load_iris + evaluator = MulticlassClassificationEvaluator( + labelCol="target", predictionCol="prediction", metricName="accuracy" + ) + metric = "accuracy" + + elif task == "regression": + load_dataset_func = load_diabetes + evaluator = RegressionEvaluator(labelCol="target", predictionCol="prediction", metricName="r2") + metric = "r2" + + elif task == "binary": + load_dataset_func = load_breast_cancer + evaluator = MulticlassClassificationEvaluator( + labelCol="target", predictionCol="prediction", metricName="accuracy" + ) + metric = "accuracy" + + final_cols = ["target", "features"] + extra_args = {} + + if estimator_list is not None and "aft_spark" in estimator_list: + # survival analysis task + pd_df = pd.read_csv( + "https://raw.githubusercontent.com/CamDavidsonPilon/lifelines/master/lifelines/datasets/rossi.csv" + ) + pd_df.rename(columns={"week": "target"}, inplace=True) + final_cols += ["arrest"] + extra_args["censorCol"] = "arrest" + else: + pd_df = load_dataset_func(as_frame=True).frame + + rename = {} + for attr in pd_df.columns: + rename[attr] = attr.replace(" ", "_") + pd_df = pd_df.rename(columns=rename) + df = spark.createDataFrame(pd_df) + df = df.repartition(4) + train, test = df.randomSplit([0.8, 0.2], seed=7654321) + feature_cols = [col for col in df.columns if col not in ["target", "arrest"]] + featurizer = VectorAssembler(inputCols=feature_cols, outputCol="features") + train_data 
= featurizer.transform(train)[final_cols] + test_data = featurizer.transform(test)[final_cols] + automl = AutoML() + settings = { + "max_iter": 1, + "estimator_list": estimator_list, # ML learner we intend to test + "task": task, # task type + "metric": metric, # metric to optimize + } + settings.update(extra_args) + df = to_pandas_on_spark(to_pandas_on_spark(train_data).to_spark(index_col="index")) + + automl.fit( + dataframe=df, + label="target", + **settings, + ) + + model = automl.model.estimator + predictions = model.transform(test_data) + predictions.show(5) + + score = evaluator.evaluate(predictions) + if estimator_list is not None: + for estimator_name in estimator_list: + leaderboard[task][estimator_name] = score + + +def _test_sparse_matrix_classification(estimator): + automl_experiment = AutoML() + automl_settings = { + "estimator_list": [estimator], + "time_budget": 2, + "metric": "auto", + "task": "classification", + "log_file_name": "test/sparse_classification.log", + "split_type": "uniform", + "n_jobs": 1, + "model_history": True, + } + X_train = scipy.sparse.random(1554, 21, dtype=int) + y_train = np.random.randint(3, size=1554) + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + + +def load_multi_dataset(): + """multivariate time series forecasting dataset""" + import pandas as pd + + # pd.set_option("display.max_rows", None, "display.max_columns", None) + df = pd.read_csv( + "https://raw.githubusercontent.com/srivatsan88/YouTubeLI/master/dataset/nyc_energy_consumption.csv" + ) + # preprocessing data + df["timeStamp"] = pd.to_datetime(df["timeStamp"]) + df = df.set_index("timeStamp") + df = df.resample("D").mean() + df["temp"] = df["temp"].fillna(method="ffill") + df["precip"] = df["precip"].fillna(method="ffill") + df = df[:-2] # last two rows are NaN for 'demand' column so remove them + df = df.reset_index() + + return df + + +def _test_forecast(estimator_list, budget=10): + if isinstance(estimator_list, str): + estimator_list = [estimator_list] + df = load_multi_dataset() + # split data into train and test + time_horizon = 180 + num_samples = df.shape[0] + split_idx = num_samples - time_horizon + train_df = df[:split_idx] + test_df = df[split_idx:] + # test dataframe must contain values for the regressors / multivariate variables + X_test = test_df[["timeStamp", "precip", "temp"]] + y_test = test_df["demand"] + # return + automl = AutoML() + settings = { + "time_budget": budget, # total running time in seconds + "metric": "mape", # primary metric + "task": "ts_forecast", # task type + "log_file_name": "test/energy_forecast_numerical.log", # flaml log file + "log_dir": "logs/forecast_logs", # tcn/tft log folder + "eval_method": "holdout", + "log_type": "all", + "label": "demand", + "estimator_list": estimator_list, + } + """The main flaml automl API""" + automl.fit(dataframe=train_df, **settings, period=time_horizon) + print(automl.best_config) + pred_y = automl.predict(X_test) + mape = sklearn_metric_loss_score("mape", pred_y, y_test) + for estimator_name in estimator_list: + leaderboard["forecast"][estimator_name] = mape + + +class TestExtraModel(unittest.TestCase): + @unittest.skipIf(skip_spark, reason="Spark is not installed. Skip all spark tests.") + def test_rf_spark(self): + tasks = ["classification", "regression"] + for task in tasks: + _test_spark_models("rf_spark", task) + + @unittest.skipIf(skip_spark, reason="Spark is not installed. 
Skip all spark tests.") + def test_nb_spark(self): + _test_spark_models("nb_spark", "classification") + + @unittest.skipIf(skip_spark, reason="Spark is not installed. Skip all spark tests.") + def test_glr(self): + _test_spark_models("glr_spark", "regression") + + @unittest.skipIf(skip_spark, reason="Spark is not installed. Skip all spark tests.") + def test_lr(self): + _test_spark_models("lr_spark", "regression") + + @unittest.skipIf(skip_spark, reason="Spark is not installed. Skip all spark tests.") + def test_svc_spark(self): + _test_spark_models("svc_spark", "binary") + + @unittest.skipIf(skip_spark, reason="Spark is not installed. Skip all spark tests.") + def test_gbt_spark(self): + tasks = ["binary", "regression"] + for task in tasks: + _test_spark_models("gbt_spark", task) + + @unittest.skipIf(skip_spark, reason="Spark is not installed. Skip all spark tests.") + def test_aft(self): + _test_spark_models("aft_spark", "regression") + + @unittest.skipIf(skip_spark, reason="Spark is not installed. Skip all spark tests.") + def test_default_spark(self): + _test_spark_models(None, "classification") + + def test_svc(self): + _test_regular_models("svc", "classification") + _test_sparse_matrix_classification("svc") + + def test_sgd(self): + tasks = ["classification", "regression"] + for task in tasks: + _test_regular_models("sgd", task) + _test_sparse_matrix_classification("sgd") + + def test_enet(self): + _test_regular_models("enet", "regression") + + def test_lassolars(self): + _test_regular_models("lassolars", "regression") + _test_forecast("lassolars") + + def test_seasonal_naive(self): + _test_forecast("snaive") + + def test_naive(self): + _test_forecast("naive") + + def test_seasonal_avg(self): + _test_forecast("savg") + + def test_avg(self): + _test_forecast("avg") + + @unittest.skipIf(skip_spark, reason="Skip on Mac or Windows") + def test_tcn(self): + _test_forecast("tcn") + + +if __name__ == "__main__": + unittest.main() + print(leaderboard) diff --git a/test/automl/test_forecast.py b/test/automl/test_forecast.py index 8f0a24a1cf..6e5d97d4f8 100644 --- a/test/automl/test_forecast.py +++ b/test/automl/test_forecast.py @@ -1,4 +1,5 @@ import datetime +import os import sys import numpy as np @@ -95,6 +96,7 @@ def test_forecast_automl(budget=10, estimators_when_no_prophet=["arima", "sarima ) +@pytest.mark.skipif(sys.platform == "darwin" or "nt" in os.name, reason="skip on mac or windows") def test_models(budget=3): n = 200 X = pd.DataFrame( @@ -571,7 +573,7 @@ def test_forecast_panel(budget=5): print(f"Training duration of best run: {automl.best_config_train_time}s") print(automl.model.estimator) """ pickle and save the automl object """ - import pickle + import dill as pickle with open("automl.pkl", "wb") as f: pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL) diff --git a/test/automl/test_mlflow.py b/test/automl/test_mlflow.py index 36fc53dba8..3ce893d223 100644 --- a/test/automl/test_mlflow.py +++ b/test/automl/test_mlflow.py @@ -1,3 +1,5 @@ +import pickle + import mlflow import mlflow.entities import pytest @@ -9,43 +11,55 @@ class TestMLFlowLoggingParam: def test_should_start_new_run_by_default(self, automl_settings): - with mlflow.start_run(): - parent = mlflow.last_active_run() + with mlflow.start_run() as parent_run: automl = AutoML() X_train, y_train = load_iris(return_X_y=True) automl.fit(X_train=X_train, y_train=y_train, **automl_settings) + try: + self._check_mlflow_parameters(automl, parent_run.info) + except FileNotFoundError: + print("[WARNING]: No file found") - children 
= self._get_child_runs(parent) + children = self._get_child_runs(parent_run) assert len(children) >= 1, f"Expected at least 1 child run, got {len(children)}" def test_should_not_start_new_run_when_mlflow_logging_set_to_false_in_init(self, automl_settings): - with mlflow.start_run(): - parent = mlflow.last_active_run() + with mlflow.start_run() as parent_run: automl = AutoML(mlflow_logging=False) X_train, y_train = load_iris(return_X_y=True) automl.fit(X_train=X_train, y_train=y_train, **automl_settings) + try: + self._check_mlflow_parameters(automl, parent_run.info) + except FileNotFoundError: + print("[WARNING]: logged model file not found; skipping parameter check") - children = self._get_child_runs(parent) + children = self._get_child_runs(parent_run) assert len(children) == 0, f"Expected 0 child runs, got {len(children)}" def test_should_not_start_new_run_when_mlflow_logging_set_to_false_in_fit(self, automl_settings): - with mlflow.start_run(): - parent = mlflow.last_active_run() + with mlflow.start_run() as parent_run: automl = AutoML() X_train, y_train = load_iris(return_X_y=True) automl.fit(X_train=X_train, y_train=y_train, mlflow_logging=False, **automl_settings) + try: + self._check_mlflow_parameters(automl, parent_run.info) + except FileNotFoundError: + print("[WARNING]: logged model file not found; skipping parameter check") - children = self._get_child_runs(parent) + children = self._get_child_runs(parent_run) assert len(children) == 0, f"Expected 0 child runs, got {len(children)}" def test_should_start_new_run_when_mlflow_logging_set_to_true_in_fit(self, automl_settings): - with mlflow.start_run(): - parent = mlflow.last_active_run() + with mlflow.start_run() as parent_run: automl = AutoML(mlflow_logging=False) X_train, y_train = load_iris(return_X_y=True) automl.fit(X_train=X_train, y_train=y_train, mlflow_logging=True, **automl_settings) + try: + self._check_mlflow_parameters(automl, parent_run.info) + except FileNotFoundError: + print("[WARNING]: logged model file not found; skipping parameter check") - children = self._get_child_runs(parent) + children = self._get_child_runs(parent_run) assert len(children) >= 1, f"Expected at least 1 child run, got {len(children)}" @staticmethod @@ -55,11 +69,40 @@ def _get_child_runs(parent_run: mlflow.entities.Run) -> DataFrame: [experiment_id], filter_string=f"tags.mlflow.parentRunId = '{parent_run.info.run_id}'" ) + @staticmethod + def _check_mlflow_parameters(automl: AutoML, run_info: mlflow.entities.RunInfo): + with open( + f"./mlruns/{run_info.experiment_id}/{run_info.run_id}/artifacts/automl_pipeline/model.pkl", "rb" + ) as f: + t = pickle.load(f) + if __name__ == "__main__": + print(t) + for param in automl.model._model._get_param_names(): + assert getattr(t._final_estimator._model, param) == getattr( + automl.model._model, param + ), "The mlflow logged model is not consistent with the automl model" + if __name__ == "__main__": + print(param, "\t", getattr(automl.model._model, param)) + print("[INFO]: successfully logged") + @pytest.fixture(scope="class") def automl_settings(self): + mlflow.end_run() return { - "time_budget": 2, # in seconds + "time_budget": 5, # in seconds "metric": "accuracy", "task": "classification", "log_file_name": "iris.log", } + + +if __name__ == "__main__": + s = TestMLFlowLoggingParam() + automl_settings = { + "time_budget": 5, # in seconds + "metric": "accuracy", + "task": "classification", + "log_file_name": "iris.log", + } + s.test_should_start_new_run_by_default(automl_settings) + s.test_should_start_new_run_when_mlflow_logging_set_to_true_in_fit(automl_settings) diff --git a/test/spark/test_0sparkml.py 
b/test/spark/test_0sparkml.py index 8ff3a1f2af..3f2198241c 100644 --- a/test/spark/test_0sparkml.py +++ b/test/spark/test_0sparkml.py @@ -5,6 +5,7 @@ import mlflow import pytest import sklearn.datasets as skds +from packaging.version import Version from flaml import AutoML from flaml.tune.spark.utils import check_spark @@ -20,23 +21,26 @@ from flaml.automl.spark.utils import to_pandas_on_spark - postfix_version = "-spark3.3," if pyspark.__version__ > "3.2" else "," spark = ( pyspark.sql.SparkSession.builder.appName("MyApp") .master("local[2]") .config( "spark.jars.packages", ( - f"com.microsoft.azure:synapseml_2.12:0.11.3{postfix_version}" + "com.microsoft.azure:synapseml_2.12:1.0.4," "org.apache.hadoop:hadoop-azure:3.3.5," "com.microsoft.azure:azure-storage:8.6.6," - f"org.mlflow:mlflow-spark:2.6.0" + f"org.mlflow:mlflow-spark_2.12:{mlflow.__version__}" + if Version(mlflow.__version__) >= Version("2.9.0") + else f"org.mlflow:mlflow-spark:{mlflow.__version__}" ), ) .config("spark.jars.repositories", "https://mmlspark.azureedge.net/maven") .config("spark.sql.debug.maxToStringFields", "100") .config("spark.driver.extraJavaOptions", "-Xss1m") .config("spark.executor.extraJavaOptions", "-Xss1m") + # .config("spark.executor.memory", "48G") + # .config("spark.driver.memory", "48G") .getOrCreate() ) spark.sparkContext._conf.set( @@ -49,6 +53,10 @@ except ImportError: skip_spark = True +if sys.version_info >= (3, 11): + skip_py311 = True +else: + skip_py311 = False pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.") @@ -159,10 +167,11 @@ def test_spark_input_df(): settings = { "time_budget": 30, # total running time in seconds "metric": "roc_auc", - "estimator_list": ["lgbm_spark"], # list of ML learners; we tune lightgbm in this example + # "estimator_list": ["lgbm_spark"], # list of ML learners; we tune lightgbm in this example "task": "classification", # task type "log_file_name": "flaml_experiment.log", # flaml log file "seed": 7654321, # random seed + "eval_method": "holdout", } df = to_pandas_on_spark(to_pandas_on_spark(train_data).to_spark(index_col="index")) @@ -176,17 +185,17 @@ def test_spark_input_df(): try: model = automl.model.estimator predictions = model.transform(test_data) - predictions.show() - - # from synapse.ml.train import ComputeModelStatistics - # metrics = ComputeModelStatistics( - # evaluationMetric="classification", - # labelCol="Bankrupt?", - # scoredLabelsCol="prediction", - # ).transform(predictions) - # metrics.show() + from synapse.ml.train import ComputeModelStatistics + if not skip_py311: + # ComputeModelStatistics doesn't support python 3.11 + metrics = ComputeModelStatistics( + evaluationMetric="classification", + labelCol="Bankrupt?", + scoredLabelsCol="prediction", + ).transform(predictions) + metrics.show() except AttributeError: print("No fitted model because of too short training time.") @@ -207,6 +216,86 @@ def test_spark_input_df(): assert "No estimator is left." 
in str(excinfo.value) +def _test_spark_large_df(): + """Test with large dataframe, should not run in pipeline.""" + import os + import time + + import pandas as pd + from pyspark.sql import functions as F + + import flaml + + os.environ["FLAML_MAX_CONCURRENT"] = "8" + start_time = time.time() + + def load_higgs(): + # 11M rows, 29 columns, 1.1GB + df = ( + spark.read.format("csv") + .option("header", False) + .option("inferSchema", True) + .load("/datadrive/datasets/HIGGS.csv") + .withColumnRenamed("_c0", "target") + .withColumn("target", F.col("target").cast("integer")) + .limit(1000000) + .fillna(0) + .na.drop(how="any") + .repartition(64) + .cache() + ) + print("Number of rows in data: ", df.count()) + return df + + def load_bosch(): + # 1.184M rows, 969 cols, 1.5GB + df = ( + spark.read.format("csv") + .option("header", True) + .option("inferSchema", True) + .load("/datadrive/datasets/train_numeric.csv") + .withColumnRenamed("Response", "target") + .withColumn("target", F.col("target").cast("integer")) + .limit(1000000) + .fillna(0) + .drop("Id") + .repartition(64) + .cache() + ) + print("Number of rows in data: ", df.count()) + return df + + def prepare_data(dataset_name="higgs"): + df = load_higgs() if dataset_name == "higgs" else load_bosch() + train, test = df.randomSplit([0.75, 0.25], seed=7654321) + feature_cols = [col for col in df.columns if col not in ["target", "arrest"]] + final_cols = ["target", "features"] + featurizer = VectorAssembler(inputCols=feature_cols, outputCol="features") + train_data = featurizer.transform(train)[final_cols] + test_data = featurizer.transform(test)[final_cols] + train_data = to_pandas_on_spark(to_pandas_on_spark(train_data).to_spark(index_col="index")) + return train_data, test_data + + train_data, test_data = prepare_data("higgs") + end_time = time.time() + print("time cost in minutes for prepare data: ", (end_time - start_time) / 60) + automl = flaml.AutoML() + automl_settings = { + "max_iter": 3, + "time_budget": 7200, + "metric": "accuracy", + "task": "classification", + "seed": 1234, + "eval_method": "holdout", + } + automl.fit(dataframe=train_data, label="target", ensemble=False, **automl_settings) + model = automl.model.estimator + predictions = model.transform(test_data) + predictions.show(5) + end_time = time.time() + print("time cost in minutes: ", (end_time - start_time) / 60) + + if __name__ == "__main__": test_spark_synapseml_classification() test_spark_synapseml_regression() @@ -217,6 +306,6 @@ def test_spark_input_df(): # import pstats # from pstats import SortKey - # cProfile.run("test_spark_input_df()", "test_spark_input_df.profile") - # p = pstats.Stats("test_spark_input_df.profile") - # p.strip_dirs().sort_stats(SortKey.CUMULATIVE).print_stats("utils.py") + # cProfile.run("_test_spark_large_df()", "_test_spark_large_df.profile") + # p = pstats.Stats("_test_spark_large_df.profile") + # p.strip_dirs().sort_stats(SortKey.CUMULATIVE).print_stats(50) diff --git a/test/spark/test_mlflow.py b/test/spark/test_mlflow.py new file mode 100644 index 0000000000..5a809d5acd --- /dev/null +++ b/test/spark/test_mlflow.py @@ -0,0 +1,342 @@ +import importlib +import os +import sys +import time +import warnings + +import mlflow +import pytest +from packaging.version import Version +from sklearn.datasets import fetch_california_housing, load_diabetes +from sklearn.ensemble import RandomForestRegressor +from sklearn.metrics import r2_score +from sklearn.model_selection import train_test_split + +import flaml +from flaml.automl.spark.utils import 
to_pandas_on_spark + + try: + import pyspark + from pyspark.ml.evaluation import RegressionEvaluator + from pyspark.ml.feature import VectorAssembler + except ImportError: + pass + warnings.filterwarnings("ignore") + + skip_spark = importlib.util.find_spec("pyspark") is None + client = mlflow.tracking.MlflowClient() + + if (sys.platform.startswith("darwin") or sys.platform.startswith("win")) and sys.version_info >= (3, 10): + # TODO: remove this block when tests are stable + # The tests below fail when run together, but each function runs without error individually. + # test_tune_autolog_parentrun_nonparallel() + # test_tune_autolog_noparentrun_nonparallel() + # test_tune_noautolog_parentrun_nonparallel() + # test_tune_noautolog_noparentrun_nonparallel() + pytest.skip("skipping MacOS and Windows for python 3.10 and 3.11", allow_module_level=True) + + """ + The Spark session used by the tests below should be initiated in test_0sparkml.py when run with pytest. + """ + + +def _sklearn_tune(config): + is_autolog = config.pop("is_autolog") + is_parent_run = config.pop("is_parent_run") + is_parallel = config.pop("is_parallel") + X, y = load_diabetes(return_X_y=True, as_frame=True) + train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.25) + rf = RandomForestRegressor(**config) + rf.fit(train_x, train_y) + pred = rf.predict(test_x) + r2 = r2_score(test_y, pred) + if not is_autolog and not is_parent_run and not is_parallel: + with mlflow.start_run(nested=True): + mlflow.log_metric("r2", r2) + return {"r2": r2} + + +def _test_tune(is_autolog, is_parent_run, is_parallel): + mlflow.end_run() + mlflow_exp_name = f"test_mlflow_integration_{int(time.time())}" + mlflow_experiment = mlflow.set_experiment(mlflow_exp_name) + params = { + "n_estimators": flaml.tune.randint(100, 1000), + "min_samples_leaf": flaml.tune.randint(1, 10), + "is_autolog": is_autolog, + "is_parent_run": is_parent_run, + "is_parallel": is_parallel, + } + if is_autolog: + mlflow.autolog() + else: + mlflow.autolog(disable=True) + if is_parent_run: + mlflow.start_run(run_name=f"tune_autolog_{is_autolog}_sparktrial_{is_parallel}") + flaml.tune.run( + _sklearn_tune, + params, + metric="r2", + mode="max", + num_samples=3, + use_spark=is_parallel, + n_concurrent_trials=2 if is_parallel else 1, + mlflow_exp_name=mlflow_exp_name, + ) + mlflow.end_run() # end current run + mlflow.autolog(disable=True) + return mlflow_experiment.experiment_id + + +def _check_mlflow_logging(possible_num_runs, metric, is_parent_run, experiment_id, is_automl=False, skip_tags=False): + if isinstance(possible_num_runs, int): + possible_num_runs = [possible_num_runs] + if is_parent_run: + parent_run = mlflow.last_active_run() + child_runs = client.search_runs( + experiment_ids=[experiment_id], + filter_string=f"tags.mlflow.parentRunId = '{parent_run.info.run_id}'", + ) + else: + child_runs = client.search_runs(experiment_ids=[experiment_id]) + experiment_name = client.get_experiment(experiment_id).name + metrics = [metric in run.data.metrics for run in child_runs] + tags = ["flaml.version" in run.data.tags for run in child_runs] + params = ["learner" in run.data.params for run in child_runs] + assert ( + len(child_runs) in possible_num_runs + ), f"The number of child runs is not correct on experiment {experiment_name}." + if possible_num_runs[0] > 0: + assert all(metrics), f"The metrics are not logged correctly on experiment {experiment_name}."
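 + # the "flaml.version" tag is skipped for runs logged outside FLAML (skip_tags=True), + # and the "learner" param is only asserted for AutoML experiments (is_automl=True)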
+ assert ( + all(tags) if not skip_tags else True + ), f"The tags are not logged correctly on experiment {experiment_name}." + assert ( + all(params) if is_automl else True + ), f"The params are not logged correctly on experiment {experiment_name}." + # mlflow.delete_experiment(experiment_id) + + +@pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.") +def test_tune_autolog_parentrun_parallel(): + experiment_id = _test_tune(is_autolog=True, is_parent_run=True, is_parallel=True) + _check_mlflow_logging([4, 3], "r2", True, experiment_id) + + +def test_tune_autolog_parentrun_nonparallel(): + experiment_id = _test_tune(is_autolog=True, is_parent_run=True, is_parallel=False) + _check_mlflow_logging(3, "r2", True, experiment_id) + + +@pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.") +def test_tune_autolog_noparentrun_parallel(): + experiment_id = _test_tune(is_autolog=True, is_parent_run=False, is_parallel=True) + _check_mlflow_logging([4, 3], "r2", False, experiment_id) + + +@pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.") +def test_tune_noautolog_parentrun_parallel(): + experiment_id = _test_tune(is_autolog=False, is_parent_run=True, is_parallel=True) + _check_mlflow_logging([4, 3], "r2", True, experiment_id) + + +def test_tune_autolog_noparentrun_nonparallel(): + experiment_id = _test_tune(is_autolog=True, is_parent_run=False, is_parallel=False) + _check_mlflow_logging(3, "r2", False, experiment_id) + + +def test_tune_noautolog_parentrun_nonparallel(): + experiment_id = _test_tune(is_autolog=False, is_parent_run=True, is_parallel=False) + _check_mlflow_logging(3, "r2", True, experiment_id) + + +@pytest.mark.skipif(skip_spark, reason="Spark is not installed. 
Skip all spark tests.") +def test_tune_noautolog_noparentrun_parallel(): + experiment_id = _test_tune(is_autolog=False, is_parent_run=False, is_parallel=True) + _check_mlflow_logging(0, "r2", False, experiment_id) + + +def test_tune_noautolog_noparentrun_nonparallel(): + experiment_id = _test_tune(is_autolog=False, is_parent_run=False, is_parallel=False) + _check_mlflow_logging(3, "r2", False, experiment_id, skip_tags=True) + + +def _test_automl_sparkdata(is_autolog, is_parent_run): + mlflow.end_run() + mlflow_exp_name = f"test_mlflow_integration_{int(time.time())}" + mlflow_experiment = mlflow.set_experiment(mlflow_exp_name) + if is_autolog: + mlflow.autolog() + else: + mlflow.autolog(disable=True) + if is_parent_run: + mlflow.start_run(run_name=f"automl_sparkdata_autolog_{is_autolog}") + spark = pyspark.sql.SparkSession.builder.getOrCreate() + pd_df = load_diabetes(as_frame=True).frame + df = spark.createDataFrame(pd_df) + df = df.repartition(4).cache() + train, test = df.randomSplit([0.8, 0.2], seed=1) + feature_cols = df.columns[:-1] + featurizer = VectorAssembler(inputCols=feature_cols, outputCol="features") + train_data = featurizer.transform(train)["target", "features"] + featurizer.transform(test)["target", "features"] + automl = flaml.AutoML() + settings = { + "max_iter": 3, + "metric": "mse", + "task": "regression", # task type + "log_file_name": "flaml_experiment.log", # flaml log file + "mlflow_exp_name": mlflow_exp_name, + "log_type": "all", + "n_splits": 2, + "model_history": True, + } + df = to_pandas_on_spark(to_pandas_on_spark(train_data).to_spark(index_col="index")) + automl.fit( + dataframe=df, + label="target", + **settings, + ) + mlflow.end_run() # end current run + mlflow.autolog(disable=True) + return mlflow_experiment.experiment_id + + +def _test_automl_nonsparkdata(is_autolog, is_parent_run): + mlflow_exp_name = f"test_mlflow_integration_{int(time.time())}" + mlflow_experiment = mlflow.set_experiment(mlflow_exp_name) + if is_autolog: + mlflow.autolog() + else: + mlflow.autolog(disable=True) + if is_parent_run: + mlflow.start_run(run_name=f"automl_nonsparkdata_autolog_{is_autolog}") + automl_experiment = flaml.AutoML() + automl_settings = { + "max_iter": 3, + "metric": "r2", + "task": "regression", + "n_concurrent_trials": 2, + "use_spark": True, + "mlflow_exp_name": None if is_parent_run else mlflow_exp_name, + "log_type": "all", + "n_splits": 2, + "model_history": True, + } + X, y = load_diabetes(return_X_y=True, as_frame=True) + train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.25) + automl_experiment.fit(X_train=train_x, y_train=train_y, **automl_settings) + mlflow.end_run() # end current run + mlflow.autolog(disable=True) + return mlflow_experiment.experiment_id + + +@pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.") +def test_automl_sparkdata_autolog_parentrun(): + experiment_id = _test_automl_sparkdata(is_autolog=True, is_parent_run=True) + _check_mlflow_logging(3, "mse", True, experiment_id, is_automl=True) + + +@pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.") +def test_automl_sparkdata_autolog_noparentrun(): + experiment_id = _test_automl_sparkdata(is_autolog=True, is_parent_run=False) + _check_mlflow_logging(3, "mse", False, experiment_id, is_automl=True) + + +@pytest.mark.skipif(skip_spark, reason="Spark is not installed. 
Skip all spark tests.") +def test_automl_sparkdata_noautolog_parentrun(): + experiment_id = _test_automl_sparkdata(is_autolog=False, is_parent_run=True) + _check_mlflow_logging(3, "mse", True, experiment_id, is_automl=True) + + +@pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.") +def test_automl_sparkdata_noautolog_noparentrun(): + experiment_id = _test_automl_sparkdata(is_autolog=False, is_parent_run=False) + _check_mlflow_logging(0, "mse", False, experiment_id, is_automl=True) # no logging + + +@pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.") +def test_automl_nonsparkdata_autolog_parentrun(): + experiment_id = _test_automl_nonsparkdata(is_autolog=True, is_parent_run=True) + _check_mlflow_logging([4, 3], "r2", True, experiment_id, is_automl=True) + + +@pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.") +def test_automl_nonsparkdata_autolog_noparentrun(): + experiment_id = _test_automl_nonsparkdata(is_autolog=True, is_parent_run=False) + _check_mlflow_logging([4, 3], "r2", False, experiment_id, is_automl=True) + + +@pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.") +def test_automl_nonsparkdata_noautolog_parentrun(): + experiment_id = _test_automl_nonsparkdata(is_autolog=False, is_parent_run=True) + _check_mlflow_logging([4, 3], "r2", True, experiment_id, is_automl=True) + + +@pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.") +def test_automl_nonsparkdata_noautolog_noparentrun(): + experiment_id = _test_automl_nonsparkdata(is_autolog=False, is_parent_run=False) + _check_mlflow_logging(0, "r2", False, experiment_id, is_automl=True) # no logging + + +@pytest.mark.skipif(skip_spark, reason="Spark is not installed. 
Skip all spark tests.") +def test_exit_pyspark_autolog(): + import pyspark + + spark = pyspark.sql.SparkSession.builder.getOrCreate() + spark.sparkContext._gateway.shutdown_callback_server() # this is to avoid stucking + mlflow.autolog(disable=True) + + +def _init_spark_for_main(): + import pyspark + + spark = ( + pyspark.sql.SparkSession.builder.appName("MyApp") + .master("local[2]") + .config( + "spark.jars.packages", + ( + "com.microsoft.azure:synapseml_2.12:1.0.4," + "org.apache.hadoop:hadoop-azure:3.3.5," + "com.microsoft.azure:azure-storage:8.6.6," + f"org.mlflow:mlflow-spark_2.12:{mlflow.__version__}" + if Version(mlflow.__version__) >= Version("2.9.0") + else f"org.mlflow:mlflow-spark:{mlflow.__version__}" + ), + ) + .config("spark.jars.repositories", "https://mmlspark.azureedge.net/maven") + .config("spark.sql.debug.maxToStringFields", "100") + .config("spark.driver.extraJavaOptions", "-Xss1m") + .config("spark.executor.extraJavaOptions", "-Xss1m") + .getOrCreate() + ) + spark.sparkContext._conf.set( + "spark.mlflow.pysparkml.autolog.logModelAllowlistFile", + "https://mmlspark.blob.core.windows.net/publicwasb/log_model_allowlist.txt", + ) + + +if __name__ == "__main__": + _init_spark_for_main() + + # test_tune_autolog_parentrun_parallel() + # test_tune_autolog_parentrun_nonparallel() + test_tune_autolog_noparentrun_parallel() # TODO: runs not removed + # test_tune_noautolog_parentrun_parallel() + # test_tune_autolog_noparentrun_nonparallel() + # test_tune_noautolog_parentrun_nonparallel() + # test_tune_noautolog_noparentrun_parallel() + # test_tune_noautolog_noparentrun_nonparallel() + # test_automl_sparkdata_autolog_parentrun() + # test_automl_sparkdata_autolog_noparentrun() + # test_automl_sparkdata_noautolog_parentrun() + # test_automl_sparkdata_noautolog_noparentrun() + # test_automl_nonsparkdata_autolog_parentrun() + # test_automl_nonsparkdata_autolog_noparentrun() # TODO: runs not removed + # test_automl_nonsparkdata_noautolog_parentrun() + # test_automl_nonsparkdata_noautolog_noparentrun() + + test_exit_pyspark_autolog()