Skip to content

Commit

Permalink
Improve mlflow integration and add more models (#1331)
Browse files Browse the repository at this point in the history
* Add more Spark models and improve mlflow integration

* Update test_extra_models, setup and gitignore

* Remove autofe

* Remove autofe

* Remove autofe

* Sync changes in internal

* Fix test for env without pyspark

* Fix import errors

* Fix tests

* Fix typos

* Fix pytorch-forecasting version

* Remove internal funcs, rename _mlflow.py

* Fix import error

* Fix dependency

* Fix experiment name setting

* Fix dependency

* Update pandas version

* Update pytorch-forecasting version

* Add warning message when has_automl is False

* Fix test errors with nltk 3.8.2

* Don't enable mlflow logging w/o an active run

* Fix issue where pytorch-forecasting can't be pickled

* Update pyspark tests condition

* Update synapseml

* Update synapseml

* No parent run, no logging for OSS

* Log when autolog is enabled

* upgrade code

* Enable autolog for tune

* Increase time budget for test

* End run before starting a new run

* Update parent run

* Fix import error

* clean up

* skip macos and win

* Update notes

* Update default value of model_history
  • Loading branch information
thinkall authored Aug 12, 2024
1 parent bd34b4e commit 635dfbd
Show file tree
Hide file tree
Showing 22 changed files with 3,145 additions and 317 deletions.
16 changes: 8 additions & 8 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,15 @@ jobs:
pip install -e .
python -c "import flaml"
pip install -e .[test]
- name: On Ubuntu python 3.8, install pyspark 3.2.3
if: matrix.python-version == '3.8' && matrix.os == 'ubuntu-latest'
- name: On Ubuntu python 3.10, install pyspark 3.4.1
if: matrix.python-version == '3.10' && matrix.os == 'ubuntu-latest'
run: |
pip install pyspark==3.2.3
pip install pyspark==3.4.1
pip list | grep "pyspark"
- name: On Ubuntu python 3.11, install pyspark 3.5.1
if: matrix.python-version == '3.11' && matrix.os == 'ubuntu-latest'
run: |
pip install pyspark==3.5.1
pip list | grep "pyspark"
- name: If linux and python<3.11, install ray 2
if: matrix.os == 'ubuntu-latest' && matrix.python-version != '3.11'
Expand All @@ -77,11 +82,6 @@ jobs:
if: matrix.python-version == '3.8' || matrix.python-version == '3.9'
run: |
pip install -e .[vw]
- name: Uninstall pyspark on (python 3.9) or windows
if: matrix.python-version == '3.9' || matrix.os == 'windows-2019'
run: |
# Uninstall pyspark to test env without pyspark
pip uninstall -y pyspark
- name: Test with pytest
if: matrix.python-version != '3.10'
run: |
Expand Down
18 changes: 18 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,24 @@ output/
flaml/tune/spark/mylearner.py
*.pkl

data/
benchmark/pmlb/csv_datasets
benchmark/*.csv

checkpoints/
test/default
test/housing.json
test/nlp/default/transformer_ms/seq-classification.json

flaml/fabric/fanova/_fanova.c
# local config files
*.config.local

local_debug/
patch.diff

# Test things
notebook/lightning_logs/
lightning_logs/
flaml/autogen/extensions/tmp/
test/autogen/my_tmp/
10 changes: 9 additions & 1 deletion flaml/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,18 @@
import logging

from flaml.automl import AutoML, logger_formatter
try:
from flaml.automl import AutoML, logger_formatter

has_automl = True
except ImportError:
has_automl = False
from flaml.onlineml.autovw import AutoVW
from flaml.tune.searcher import CFO, FLOW2, BlendSearch, BlendSearchTuner, RandomSearch
from flaml.version import __version__

# Set the root logger.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

if not has_automl:
logger.warning("flaml.automl is not available. Please install flaml[automl] to enable AutoML functionalities.")
187 changes: 138 additions & 49 deletions flaml/automl/automl.py

Large diffs are not rendered by default.

26 changes: 24 additions & 2 deletions flaml/automl/ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from flaml.automl.spark import ERROR as SPARK_ERROR
from flaml.automl.spark import DataFrame, Series, psDataFrame, psSeries
from flaml.automl.task.task import Task
from flaml.automl.time_series import TimeSeriesDataset

try:
from sklearn.metrics import (
Expand All @@ -33,7 +34,6 @@
if SPARK_ERROR is None:
from flaml.automl.spark.metrics import spark_metric_loss_score

from flaml.automl.time_series import TimeSeriesDataset

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -89,6 +89,11 @@
"wer": "min",
}
huggingface_submetric_to_metric = {"rouge1": "rouge", "rouge2": "rouge"}
spark_metric_name_dict = {
"Regression": ["r2", "rmse", "mse", "mae", "var"],
"Binary Classification": ["pr_auc", "roc_auc"],
"Multi-class Classification": ["accuracy", "log_loss", "f1", "micro_f1", "macro_f1"],
}


def metric_loss_score(
Expand Down Expand Up @@ -122,7 +127,7 @@ def metric_loss_score(
import datasets

datasets_metric_name = huggingface_submetric_to_metric.get(metric_name, metric_name.split(":")[0])
metric = datasets.load_metric(datasets_metric_name)
metric = datasets.load_metric(datasets_metric_name, trust_remote_code=True)
metric_mode = huggingface_metric_to_mode[datasets_metric_name]

if metric_name.startswith("seqeval"):
Expand Down Expand Up @@ -334,6 +339,14 @@ def compute_estimator(
if fit_kwargs is None:
fit_kwargs = {}

fe_params = {}
for param, value in config_dic.items():
if param.startswith("fe."):
fe_params[param] = value

for param, value in fe_params.items():
config_dic.pop(param)

estimator_class = estimator_class or task.estimator_class_from_str(estimator_name)
estimator = estimator_class(
**config_dic,
Expand Down Expand Up @@ -401,12 +414,21 @@ def train_estimator(
free_mem_ratio=0,
) -> Tuple[EstimatorSubclass, float]:
start_time = time.time()
fe_params = {}
for param, value in config_dic.items():
if param.startswith("fe."):
fe_params[param] = value

for param, value in fe_params.items():
config_dic.pop(param)

estimator_class = estimator_class or task.estimator_class_from_str(estimator_name)
estimator = estimator_class(
**config_dic,
task=task,
n_jobs=n_jobs,
)

if fit_kwargs is None:
fit_kwargs = {}

Expand Down
Loading

0 comments on commit 635dfbd

Please sign in to comment.