Improve annotations in automl and ml modules (microsoft#919)
* begin annotation in automl.py and ml.py

* EstimatorSubclass + annotate metric

* review: fixes + setting fit_kwargs as proper Optional

* import from flaml.automl.model (import from flaml.model is deprecated)

* comment n_jobs in train_estimator as well

* better annotation in _compute_with_config_base

Co-authored-by: Qingyun Wu <qingyun.wu@psu.edu>

---------

Co-authored-by: Andrea W <a.ruggerini@ammagamma.com>
Co-authored-by: Qingyun Wu <qingyun.wu@psu.edu>
3 people authored Feb 22, 2023
1 parent 090af78 commit 8263310
Showing 2 changed files with 67 additions and 39 deletions.
21 changes: 12 additions & 9 deletions flaml/automl/automl.py
@@ -2,6 +2,7 @@
 # * Copyright (c) FLAML authors. All rights reserved.
 # * Licensed under the MIT License. See LICENSE file in the
 # * project root for license information.
+from __future__ import annotations
 import time
 import os
 import sys
@@ -306,7 +307,7 @@ def est_retrain_time(self, retrain_sample_size):
 
 
 class AutoMLState:
-    def _prepare_sample_train_data(self, sample_size):
+    def _prepare_sample_train_data(self, sample_size: int):
         sampled_weight = groups = None
         if sample_size <= self.data_size[0]:
             if isinstance(self.X_train, pd.DataFrame):
@@ -344,7 +345,9 @@ def _prepare_sample_train_data(self, sample_size):
         return sampled_X_train, sampled_y_train, sampled_weight, groups
 
     @staticmethod
-    def _compute_with_config_base(config_w_resource, state, estimator, is_report=True):
+    def _compute_with_config_base(
+        config_w_resource: dict, state: AutoMLState, estimator: str, is_report: bool = True
+    ) -> dict:
         if "FLAML_sample_size" in config_w_resource:
            sample_size = int(config_w_resource["FLAML_sample_size"])
         else:
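The annotation `state: AutoMLState` in the hunk above refers to `AutoMLState` inside that class's own body, which only works because the commit also adds `from __future__ import annotations` (PEP 563): annotations become lazily evaluated strings instead of being resolved at definition time. A minimal, self-contained sketch of the pattern, with an illustrative class name:

```python
from __future__ import annotations


class Node:
    # Without the future import, "Node" would not yet be bound when this
    # method is defined, and evaluating the annotation would raise NameError.
    def link(self, other: Node) -> Node:
        self.next = other
        return self
```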
@@ -435,9 +438,9 @@ def sanitize(cls, config: dict) -> dict:
 
     def _train_with_config(
         self,
-        estimator,
-        config_w_resource,
-        sample_size=None,
+        estimator: str,
+        config_w_resource: dict,
+        sample_size: Optional[int] = None,
     ):
         if not sample_size:
             sample_size = config_w_resource.get(
@@ -801,11 +804,11 @@ def custom_metric(
             "classifier" if settings["task"] in CLASSIFICATION else "regressor"
         )
 
-    def get_params(self, deep=False):
+    def get_params(self, deep: bool = False) -> dict:
         return self._settings.copy()
 
     @property
-    def config_history(self):
+    def config_history(self) -> dict:
         """A dictionary of iter->(estimator, config, time),
         storing the best estimator, config, and the time when the best
         model is updated each time.
@@ -819,7 +822,7 @@ def model(self):
         """
         return self.__dict__.get("_trained_estimator")
 
-    def best_model_for_estimator(self, estimator_name):
+    def best_model_for_estimator(self, estimator_name: str):
         """Return the best model found for a particular estimator.
 
         Args:
@@ -1587,7 +1590,7 @@ def add_learner(self, learner_name, learner_class):
         """
         self._state.learner_classes[learner_name] = learner_class
 
-    def get_estimator_from_log(self, log_file_name, record_id, task):
+    def get_estimator_from_log(self, log_file_name: str, record_id: int, task: str):
         """Get the estimator from log file.
 
         Args:
85 changes: 55 additions & 30 deletions flaml/automl/ml.py
@@ -5,6 +5,8 @@
 import time
 import numpy as np
 import pandas as pd
+from typing import Union, Callable, TypeVar, Optional, Tuple
+
 from sklearn.metrics import (
     mean_squared_error,
     r2_score,
@@ -46,9 +48,11 @@
     TransformersEstimatorModelSelection,
 )
 from flaml.automl.data import CLASSIFICATION, group_counts, TS_FORECAST
+from flaml.automl.model import BaseEstimator
 import logging
 
 logger = logging.getLogger(__name__)
+EstimatorSubclass = TypeVar("EstimatorSubclass", bound=BaseEstimator)
 
 sklearn_metric_name_set = {
     "r2",
@@ -101,7 +105,12 @@
 huggingface_submetric_to_metric = {"rouge1": "rouge", "rouge2": "rouge"}
 
 
-def get_estimator_class(task, estimator_name):
+def get_estimator_class(task: str, estimator_name: str) -> EstimatorSubclass:
+    """Given a task and an estimator name, return the relevant flaml-wrapped estimator class.
+
+    NOTE: See why the return type is declared by using TypeVar here on the mypy doc:
+    https://mypy.readthedocs.io/en/stable/kinds_of_types.html#the-type-of-class-objects
+    """
     # when adding a new learner, need to add an elif branch
     if "xgboost" == estimator_name:
         estimator_class = XGBoost_TS if task in TS_FORECAST else XGBoostSklearnEstimator
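For readers unfamiliar with the bounded `TypeVar` introduced above, here is a minimal sketch of why a bound helps mypy track class objects, per the linked mypy doc. The class names are illustrative, not FLAML's real hierarchy:

```python
from typing import Type, TypeVar


class Base:
    pass


class Concrete(Base):
    pass


T = TypeVar("T", bound=Base)


def make(cls: Type[T]) -> T:
    # mypy infers the concrete subclass from the class object passed in,
    # so the caller gets back a precisely typed instance.
    return cls()


obj = make(Concrete)  # inferred type: Concrete, not just Base
```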
@@ -144,7 +153,7 @@ def get_estimator_class(task, estimator_name):
 
 
 def metric_loss_score(
-    metric_name,
+    metric_name: str,
     y_processed_predict,
     y_processed_true,
     labels=None,
@@ -223,19 +232,19 @@ def metric_loss_score(
     return score
 
 
-def is_in_sklearn_metric_name_set(metric_name):
+def is_in_sklearn_metric_name_set(metric_name: str):
     return metric_name.startswith("ndcg") or metric_name in sklearn_metric_name_set
 
 
-def is_min_metric(metric_name):
+def is_min_metric(metric_name: str):
     return (
         metric_name in ["rmse", "mae", "mse", "log_loss", "mape"]
         or huggingface_metric_to_mode.get(metric_name, None) == "min"
     )
 
 
 def sklearn_metric_loss_score(
-    metric_name,
+    metric_name: str,
     y_predict,
     y_true,
     labels=None,
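A quick usage sketch for `sklearn_metric_loss_score` (the data here is made up). The function returns a loss, so maximizing metrics are flipped, e.g. r2 is reported as 1 - r2; note also that `y_predict` comes before `y_true` in the signature above:

```python
import numpy as np
from flaml.automl.ml import sklearn_metric_loss_score

y_true = np.array([1.0, 2.0, 3.0])
y_pred = np.array([1.1, 1.9, 3.2])

print(sklearn_metric_loss_score("mse", y_pred, y_true))  # lower is better
print(sklearn_metric_loss_score("r2", y_pred, y_true))   # 1 - r2_score
```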
@@ -372,7 +381,7 @@ def _eval_estimator(
     y_val,
     weight_val,
     groups_val,
-    eval_metric,
+    eval_metric: Union[str, Callable],
     obj,
     labels=None,
     log_training_metric=False,
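The `Union[str, Callable]` annotation reflects that `eval_metric` can be either a built-in metric name or a user-defined function. A hedged sketch of such a callable, following the signature FLAML's documentation gives for custom metrics (it returns a loss to minimize plus a dict of metrics to log; adapt to your data):

```python
from sklearn.metrics import log_loss


def custom_metric(
    X_val, y_val, estimator, labels,
    X_train, y_train,
    weight_val=None, weight_train=None,
    *args,
):
    # Score the fitted estimator on the validation fold.
    y_pred = estimator.predict_proba(X_val)
    val_loss = log_loss(y_val, y_pred, labels=labels, sample_weight=weight_val)
    # First return value is minimized; the dict is logged alongside it.
    return val_loss, {"val_log_loss": val_loss}
```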
@@ -424,14 +433,14 @@
 
 def get_val_loss(
     config,
-    estimator,
+    estimator: EstimatorSubclass,
     X_train,
     y_train,
     X_val,
     y_val,
     weight_val,
     groups_val,
-    eval_metric,
+    eval_metric: Union[str, Callable],
     obj,
     labels=None,
     budget=None,
@@ -487,13 +496,13 @@ def default_cv_score_agg_func(val_loss_folds, log_metrics_folds):
 
 
 def evaluate_model_CV(
-    config,
-    estimator,
+    config: dict,
+    estimator: EstimatorSubclass,
     X_train_all,
     y_train_all,
     budget,
     kf,
-    task,
+    task: str,
     eval_metric,
     best_val_loss,
     cv_score_agg_func=None,
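When `cv_score_agg_func=None`, FLAML falls back to `default_cv_score_agg_func` (seen in the hunk header above), which averages across folds. A hedged sketch of a custom aggregator, assuming the documented per-fold-losses / per-fold-logged-metrics interface:

```python
import numpy as np


def median_cv_score_agg(val_loss_folds, log_metrics_folds):
    # Aggregate per-fold validation losses with the median instead of the mean.
    metric_to_minimize = float(np.median(val_loss_folds))
    # Return None, or a dict aggregated from log_metrics_folds, to be logged.
    metrics_to_log = None
    return metric_to_minimize, metrics_to_log
```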
@@ -607,19 +616,24 @@ def compute_estimator(
     groups_val,
     budget,
     kf,
-    config_dic,
-    task,
-    estimator_name,
-    eval_method,
-    eval_metric,
+    config_dic: dict,
+    task: str,
+    estimator_name: str,
+    eval_method: str,
+    eval_metric: Union[str, Callable],
     best_val_loss=np.Inf,
-    n_jobs=1,
-    estimator_class=None,
-    cv_score_agg_func=None,
-    log_training_metric=False,
-    fit_kwargs={},
+    n_jobs: Optional[
+        int
+    ] = 1,  # some estimators of EstimatorSubclass don't accept n_jobs. Should be None in that case.
+    estimator_class: Optional[EstimatorSubclass] = None,
+    cv_score_agg_func: Optional[callable] = None,
+    log_training_metric: Optional[bool] = False,
+    fit_kwargs: Optional[dict] = None,
     free_mem_ratio=0,
 ):
+    if not fit_kwargs:
+        fit_kwargs = {}
+
     estimator_class = estimator_class or get_estimator_class(task, estimator_name)
     estimator = estimator_class(
         **config_dic,
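The switch from `fit_kwargs={}` to `fit_kwargs: Optional[dict] = None` plus the `if not fit_kwargs:` guard avoids Python's shared-mutable-default pitfall: a default dict is created once at definition time and reused across calls. A minimal standalone demonstration, with illustrative function names:

```python
def leaky(kwargs={}):
    # The default dict is created once and shared by every call.
    kwargs["calls"] = kwargs.get("calls", 0) + 1
    return kwargs


print(leaky())  # {'calls': 1}
print(leaky())  # {'calls': 2} -- state leaked across calls


def safe(kwargs=None):
    if not kwargs:
        kwargs = {}  # fresh dict on every call, as in the diff above
    kwargs["calls"] = kwargs.get("calls", 0) + 1
    return kwargs


print(safe())  # {'calls': 1}
print(safe())  # {'calls': 1}
```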
@@ -677,25 +691,30 @@
 
 
 def train_estimator(
-    config_dic,
+    config_dic: dict,
     X_train,
     y_train,
-    task,
-    estimator_name,
-    n_jobs=1,
-    estimator_class=None,
+    task: str,
+    estimator_name: str,
+    n_jobs: Optional[
+        int
+    ] = 1,  # some estimators of EstimatorSubclass don't accept n_jobs. Should be None in that case.
+    estimator_class: Optional[EstimatorSubclass] = None,
     budget=None,
-    fit_kwargs={},
+    fit_kwargs: Optional[dict] = None,
     eval_metric=None,
     free_mem_ratio=0,
-):
+) -> Tuple[EstimatorSubclass, float]:
     start_time = time.time()
     estimator_class = estimator_class or get_estimator_class(task, estimator_name)
     estimator = estimator_class(
         **config_dic,
         task=task,
         n_jobs=n_jobs,
     )
+    if not fit_kwargs:
+        fit_kwargs = {}
+
     if isinstance(estimator, TransformersEstimator):
         fit_kwargs["metric"] = eval_metric

@@ -717,7 +736,9 @@ def get_classification_objective(num_labels: int) -> str:
     return objective_name
 
 
-def norm_confusion_matrix(y_true, y_pred):
+def norm_confusion_matrix(
+    y_true: Union[np.array, pd.Series], y_pred: Union[np.array, pd.Series]
+):
     """normalized confusion matrix.
 
     Args:
@@ -735,7 +756,11 @@ def norm_confusion_matrix(y_true, y_pred):
     return norm_conf_mat
 
 
-def multi_class_curves(y_true, y_pred_proba, curve_func):
+def multi_class_curves(
+    y_true: Union[np.array, pd.Series],
+    y_pred_proba: Union[np.array, pd.Series],
+    curve_func: Callable,
+):
     """Binarize the data for multi-class tasks and produce ROC or precision-recall curves.
 
     Args:
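A hedged usage sketch for the two helpers annotated above. The data is random, and the return shapes assume what the docstrings describe: a row-normalized confusion matrix, and per-class curve dictionaries keyed by class index:

```python
import numpy as np
from sklearn.metrics import roc_curve
from flaml.automl.ml import norm_confusion_matrix, multi_class_curves

y_true = np.array([0, 1, 2, 1, 0, 2])
y_pred = np.array([0, 2, 2, 1, 0, 1])
print(norm_confusion_matrix(y_true, y_pred))  # rows normalized to sum to 1

# Fake class probabilities for three classes, one row per sample.
y_proba = np.random.dirichlet(np.ones(3), size=6)
curve_x, curve_y = multi_class_curves(y_true, y_proba, roc_curve)
print(curve_x[0], curve_y[0])  # FPR/TPR for class 0, one-vs-rest
```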
