Improve mlflow integration and add more models (#1331)

* Add more spark models and improved mlflow integration
* Update test_extra_models, setup and gitignore
* Remove autofe
* Remove autofe
* Remove autofe
* Sync changes in internal
* Fix test for env without pyspark
* Fix import errors
* Fix tests
* Fix typos
* Fix pytorch-forecasting version
* Remove internal funcs, rename _mlflow.py
* Fix import error
* Fix dependency
* Fix experiment name setting
* Fix dependency
* Update pandas version
* Update pytorch-forecasting version
* Add warning message for not has_automl
* Fix test errors with nltk 3.8.2
* Don't enable mlflow logging w/o an active run
* Fix pytorch-forecasting can't be pickled issue
* Update pyspark tests condition
* Update synapseml
* Update synapseml
* No parent run, no logging for OSS
* Log when autolog is enabled
* upgrade code
* Enable autolog for tune
* Increase time budget for test
* End run before start a new run
* Update parent run
* Fix import error
* clean up
* skip macos and win
* Update notes
* Update default value of model_history
microsoft · Aug 12, 2024 · 635dfbd · 635dfbd
1 parent bd34b4e
commit 635dfbd
Show file tree

Hide file tree

Showing 22 changed files with 3,145 additions and 317 deletions.
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -54,10 +54,15 @@ jobs:
           pip install -e .
           python -c "import flaml"
           pip install -e .[test]
-      - name: On Ubuntu python 3.8, install pyspark 3.2.3
-        if: matrix.python-version == '3.8' && matrix.os == 'ubuntu-latest'
+      - name: On Ubuntu python 3.10, install pyspark 3.4.1
+        if: matrix.python-version == '3.10' && matrix.os == 'ubuntu-latest'
         run: |
-          pip install pyspark==3.2.3
+          pip install pyspark==3.4.1
+          pip list | grep "pyspark"
+      - name: On Ubuntu python 3.11, install pyspark 3.5.1
+        if: matrix.python-version == '3.11' && matrix.os == 'ubuntu-latest'
+        run: |
+          pip install pyspark==3.5.1
           pip list | grep "pyspark"
       - name: If linux and python<3.11, install ray 2
         if: matrix.os == 'ubuntu-latest' && matrix.python-version != '3.11'
@@ -77,11 +82,6 @@ jobs:
         if: matrix.python-version == '3.8' || matrix.python-version == '3.9'
         run: |
           pip install -e .[vw]
-      - name: Uninstall pyspark on (python 3.9) or windows
-        if: matrix.python-version == '3.9' || matrix.os == 'windows-2019'
-        run: |
-          # Uninstall pyspark to test env without pyspark
-          pip uninstall -y pyspark
       - name: Test with pytest
         if: matrix.python-version != '3.10'
         run: |

diff --git a/.gitignore b/.gitignore
@@ -163,6 +163,24 @@ output/
 flaml/tune/spark/mylearner.py
 *.pkl
 
+data/
+benchmark/pmlb/csv_datasets
+benchmark/*.csv
+
+checkpoints/
+test/default
+test/housing.json
+test/nlp/default/transformer_ms/seq-classification.json
+
+flaml/fabric/fanova/_fanova.c
 # local config files
 *.config.local
+
+local_debug/
 patch.diff
+
+# Test things
+notebook/lightning_logs/
+lightning_logs/
+flaml/autogen/extensions/tmp/
+test/autogen/my_tmp/
diff --git a/flaml/__init__.py b/flaml/__init__.py
@@ -1,10 +1,18 @@
 import logging
 
-from flaml.automl import AutoML, logger_formatter
+try:
+    from flaml.automl import AutoML, logger_formatter
+
+    has_automl = True
+except ImportError:
+    has_automl = False
 from flaml.onlineml.autovw import AutoVW
 from flaml.tune.searcher import CFO, FLOW2, BlendSearch, BlendSearchTuner, RandomSearch
 from flaml.version import __version__
 
 # Set the root logger.
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
+
+if not has_automl:
+    logger.warning("flaml.automl is not available. Please install flaml[automl] to enable AutoML functionalities.")
diff --git a/flaml/automl/automl.py b/flaml/automl/automl.py
diff --git a/flaml/automl/ml.py b/flaml/automl/ml.py
@@ -13,6 +13,7 @@
 from flaml.automl.spark import ERROR as SPARK_ERROR
 from flaml.automl.spark import DataFrame, Series, psDataFrame, psSeries
 from flaml.automl.task.task import Task
+from flaml.automl.time_series import TimeSeriesDataset
 
 try:
     from sklearn.metrics import (
@@ -33,7 +34,6 @@
 if SPARK_ERROR is None:
     from flaml.automl.spark.metrics import spark_metric_loss_score
 
-from flaml.automl.time_series import TimeSeriesDataset
 
 logger = logging.getLogger(__name__)
 
@@ -89,6 +89,11 @@
     "wer": "min",
 }
 huggingface_submetric_to_metric = {"rouge1": "rouge", "rouge2": "rouge"}
+spark_metric_name_dict = {
+    "Regression": ["r2", "rmse", "mse", "mae", "var"],
+    "Binary Classification": ["pr_auc", "roc_auc"],
+    "Multi-class Classification": ["accuracy", "log_loss", "f1", "micro_f1", "macro_f1"],
+}
 
 
 def metric_loss_score(
@@ -122,7 +127,7 @@ def metric_loss_score(
             import datasets
 
             datasets_metric_name = huggingface_submetric_to_metric.get(metric_name, metric_name.split(":")[0])
-            metric = datasets.load_metric(datasets_metric_name)
+            metric = datasets.load_metric(datasets_metric_name, trust_remote_code=True)
             metric_mode = huggingface_metric_to_mode[datasets_metric_name]
 
             if metric_name.startswith("seqeval"):
@@ -334,6 +339,14 @@ def compute_estimator(
     if fit_kwargs is None:
         fit_kwargs = {}
 
+    fe_params = {}
+    for param, value in config_dic.items():
+        if param.startswith("fe."):
+            fe_params[param] = value
+
+    for param, value in fe_params.items():
+        config_dic.pop(param)
+
     estimator_class = estimator_class or task.estimator_class_from_str(estimator_name)
     estimator = estimator_class(
         **config_dic,
@@ -401,12 +414,21 @@ def train_estimator(
     free_mem_ratio=0,
 ) -> Tuple[EstimatorSubclass, float]:
     start_time = time.time()
+    fe_params = {}
+    for param, value in config_dic.items():
+        if param.startswith("fe."):
+            fe_params[param] = value
+
+    for param, value in fe_params.items():
+        config_dic.pop(param)
+
     estimator_class = estimator_class or task.estimator_class_from_str(estimator_name)
     estimator = estimator_class(
         **config_dic,
         task=task,
         n_jobs=n_jobs,
     )
+
     if fit_kwargs is None:
         fit_kwargs = {}