Improve annotations in automl and ml modules (microsoft#919)
* begin annotation in automl.py and ml.py

* EstimatorSubclass + annotate metric

* review: fixes + setting fit_kwargs as proper Optional

* import from flaml.automl.model (import from flaml.model is deprecated)

* comment n_jobs in train_estimator as well

* better annotation in _compute_with_config_base

Co-authored-by: Qingyun Wu <qingyun.wu@psu.edu>

---------

Co-authored-by: Andrea W <a.ruggerini@ammagamma.com>
Co-authored-by: Qingyun Wu <qingyun.wu@psu.edu>
3 people authored Feb 22, 2023
1 parent 090af78 commit 8263310
Showing 2 changed files with 67 additions and 39 deletions.
21 changes: 12 additions & 9 deletions flaml/automl/automl.py
@@ -2,6 +2,7 @@
 # * Copyright (c) FLAML authors. All rights reserved.
 # * Licensed under the MIT License. See LICENSE file in the
 # * project root for license information.
+from __future__ import annotations
 import time
 import os
 import sys
@@ -306,7 +307,7 @@ def est_retrain_time(self, retrain_sample_size):
 
 
 class AutoMLState:
-    def _prepare_sample_train_data(self, sample_size):
+    def _prepare_sample_train_data(self, sample_size: int):
         sampled_weight = groups = None
         if sample_size <= self.data_size[0]:
             if isinstance(self.X_train, pd.DataFrame):
@@ -344,7 +345,9 @@ def _prepare_sample_train_data(self, sample_size):
         return sampled_X_train, sampled_y_train, sampled_weight, groups
 
     @staticmethod
-    def _compute_with_config_base(config_w_resource, state, estimator, is_report=True):
+    def _compute_with_config_base(
+        config_w_resource: dict, state: AutoMLState, estimator: str, is_report: bool = True
+    ) -> dict:
         if "FLAML_sample_size" in config_w_resource:
            sample_size = int(config_w_resource["FLAML_sample_size"])
         else:
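The annotation `state: AutoMLState` in the hunk above refers to `AutoMLState` inside that class's own body, which only works because the commit also adds `from __future__ import annotations` (PEP 563): annotations become lazily evaluated strings instead of being resolved at definition time. A minimal, self-contained sketch of the pattern, with an illustrative class name:

```python
from __future__ import annotations


class Node:
    # Without the future import, "Node" would not yet be bound when this
    # method is defined, and evaluating the annotation would raise NameError.
    def link(self, other: Node) -> Node:
        self.next = other
        return self
```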
@@ -435,9 +438,9 @@ def sanitize(cls, config: dict) -> dict:
 
     def _train_with_config(
         self,
-        estimator,
-        config_w_resource,
-        sample_size=None,
+        estimator: str,
+        config_w_resource: dict,
+        sample_size: Optional[int] = None,
     ):
         if not sample_size:
             sample_size = config_w_resource.get(
@@ -801,11 +804,11 @@ def custom_metric(
             "classifier" if settings["task"] in CLASSIFICATION else "regressor"
         )
 
-    def get_params(self, deep=False):
+    def get_params(self, deep: bool = False) -> dict:
         return self._settings.copy()
 
     @property
-    def config_history(self):
+    def config_history(self) -> dict:
         """A dictionary of iter->(estimator, config, time),
         storing the best estimator, config, and the time when the best
         model is updated each time.
@@ -819,7 +822,7 @@ def model(self):
         """
         return self.__dict__.get("_trained_estimator")
 
-    def best_model_for_estimator(self, estimator_name):
+    def best_model_for_estimator(self, estimator_name: str):
         """Return the best model found for a particular estimator.
 
         Args:
@@ -1587,7 +1590,7 @@ def add_learner(self, learner_name, learner_class):
         """
         self._state.learner_classes[learner_name] = learner_class
 
-    def get_estimator_from_log(self, log_file_name, record_id, task):
+    def get_estimator_from_log(self, log_file_name: str, record_id: int, task: str):
         """Get the estimator from log file.
 
         Args:
85 changes: 55 additions & 30 deletions flaml/automl/ml.py
@@ -5,6 +5,8 @@
 import time
 import numpy as np
 import pandas as pd
+from typing import Union, Callable, TypeVar, Optional, Tuple
+
 from sklearn.metrics import (
     mean_squared_error,
     r2_score,
@@ -46,9 +48,11 @@
     TransformersEstimatorModelSelection,
 )
 from flaml.automl.data import CLASSIFICATION, group_counts, TS_FORECAST
+from flaml.automl.model import BaseEstimator
 import logging
 
 logger = logging.getLogger(__name__)
+EstimatorSubclass = TypeVar("EstimatorSubclass", bound=BaseEstimator)
 
 sklearn_metric_name_set = {
     "r2",
@@ -101,7 +105,12 @@
 huggingface_submetric_to_metric = {"rouge1": "rouge", "rouge2": "rouge"}
 
 
-def get_estimator_class(task, estimator_name):
+def get_estimator_class(task: str, estimator_name: str) -> EstimatorSubclass:
+    """Given a task and an estimator name, return the relevant flaml-wrapped estimator class.
+
+    NOTE: See why the return type is declared by using TypeVar here on the mypy doc:
+    https://mypy.readthedocs.io/en/stable/kinds_of_types.html#the-type-of-class-objects
+    """
     # when adding a new learner, need to add an elif branch
     if "xgboost" == estimator_name:
         estimator_class = XGBoost_TS if task in TS_FORECAST else XGBoostSklearnEstimator
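For readers unfamiliar with the bounded `TypeVar` introduced above, here is a minimal sketch of why a bound helps mypy track class objects, per the linked mypy doc. The class names are illustrative, not FLAML's real hierarchy:

```python
from typing import Type, TypeVar


class Base:
    pass


class Concrete(Base):
    pass


T = TypeVar("T", bound=Base)


def make(cls: Type[T]) -> T:
    # mypy infers the concrete subclass from the class object passed in,
    # so the caller gets back a precisely typed instance.
    return cls()


obj = make(Concrete)  # inferred type: Concrete, not just Base
```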
@@ -144,7 +153,7 @@ def get_estimator_class(task, estimator_name):
 
 
 def metric_loss_score(
-    metric_name,
+    metric_name: str,
     y_processed_predict,
     y_processed_true,
     labels=None,
@@ -223,19 +232,19 @@ def metric_loss_score(
     return score
 
 
-def is_in_sklearn_metric_name_set(metric_name):
+def is_in_sklearn_metric_name_set(metric_name: str):
     return metric_name.startswith("ndcg") or metric_name in sklearn_metric_name_set
 
 
-def is_min_metric(metric_name):
+def is_min_metric(metric_name: str):
     return (
         metric_name in ["rmse", "mae", "mse", "log_loss", "mape"]
         or huggingface_metric_to_mode.get(metric_name, None) == "min"
     )
 
 
 def sklearn_metric_loss_score(
-    metric_name,
+    metric_name: str,
     y_predict,
     y_true,
     labels=None,
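A quick usage sketch for `sklearn_metric_loss_score` (the data here is made up). The function returns a loss, so maximizing metrics are flipped, e.g. r2 is reported as 1 - r2; note also that `y_predict` comes before `y_true` in the signature above:

```python
import numpy as np
from flaml.automl.ml import sklearn_metric_loss_score

y_true = np.array([1.0, 2.0, 3.0])
y_pred = np.array([1.1, 1.9, 3.2])

print(sklearn_metric_loss_score("mse", y_pred, y_true))  # lower is better
print(sklearn_metric_loss_score("r2", y_pred, y_true))   # 1 - r2_score
```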
@@ -372,7 +381,7 @@ def _eval_estimator(
     y_val,
     weight_val,
     groups_val,
-    eval_metric,
+    eval_metric: Union[str, Callable],
     obj,
     labels=None,
     log_training_metric=False,
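The `Union[str, Callable]` annotation reflects that `eval_metric` can be either a built-in metric name or a user-defined function. A hedged sketch of such a callable, following the signature FLAML's documentation gives for custom metrics (it returns a loss to minimize plus a dict of metrics to log; adapt to your data):

```python
from sklearn.metrics import log_loss


def custom_metric(
    X_val, y_val, estimator, labels,
    X_train, y_train,
    weight_val=None, weight_train=None,
    *args,
):
    # Score the fitted estimator on the validation fold.
    y_pred = estimator.predict_proba(X_val)
    val_loss = log_loss(y_val, y_pred, labels=labels, sample_weight=weight_val)
    # First return value is minimized; the dict is logged alongside it.
    return val_loss, {"val_log_loss": val_loss}
```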
@@ -424,14 +433,14 @@
 
 def get_val_loss(
     config,
-    estimator,
+    estimator: EstimatorSubclass,
     X_train,
     y_train,
     X_val,
     y_val,
     weight_val,
     groups_val,
-    eval_metric,
+    eval_metric: Union[str, Callable],
     obj,
     labels=None,
     budget=None,
@@ -487,13 +496,13 @@ def default_cv_score_agg_func(val_loss_folds, log_metrics_folds):
 
 
 def evaluate_model_CV(
-    config,
-    estimator,
+    config: dict,
+    estimator: EstimatorSubclass,
     X_train_all,
     y_train_all,
     budget,
     kf,
-    task,
+    task: str,
     eval_metric,
     best_val_loss,
     cv_score_agg_func=None,
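When `cv_score_agg_func=None`, FLAML falls back to `default_cv_score_agg_func` (seen in the hunk header above), which averages across folds. A hedged sketch of a custom aggregator, assuming the documented per-fold-losses / per-fold-logged-metrics interface:

```python
import numpy as np


def median_cv_score_agg(val_loss_folds, log_metrics_folds):
    # Aggregate per-fold validation losses with the median instead of the mean.
    metric_to_minimize = float(np.median(val_loss_folds))
    # Return None, or a dict aggregated from log_metrics_folds, to be logged.
    metrics_to_log = None
    return metric_to_minimize, metrics_to_log
```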
@@ -607,19 +616,24 @@ def compute_estimator(
     groups_val,
     budget,
     kf,
-    config_dic,
-    task,
-    estimator_name,
-    eval_method,
-    eval_metric,
+    config_dic: dict,
+    task: str,
+    estimator_name: str,
+    eval_method: str,
+    eval_metric: Union[str, Callable],
     best_val_loss=np.Inf,
-    n_jobs=1,
-    estimator_class=None,
-    cv_score_agg_func=None,
-    log_training_metric=False,
-    fit_kwargs={},
+    n_jobs: Optional[
+        int
+    ] = 1,  # some estimators of EstimatorSubclass don't accept n_jobs. Should be None in that case.
+    estimator_class: Optional[EstimatorSubclass] = None,
+    cv_score_agg_func: Optional[callable] = None,
+    log_training_metric: Optional[bool] = False,
+    fit_kwargs: Optional[dict] = None,
     free_mem_ratio=0,
 ):
+    if not fit_kwargs:
+        fit_kwargs = {}
+
     estimator_class = estimator_class or get_estimator_class(task, estimator_name)
     estimator = estimator_class(
         **config_dic,
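The switch from `fit_kwargs={}` to `fit_kwargs: Optional[dict] = None` plus the `if not fit_kwargs:` guard avoids Python's shared-mutable-default pitfall: a default dict is created once at definition time and reused across calls. A minimal standalone demonstration, with illustrative function names:

```python
def leaky(kwargs={}):
    # The default dict is created once and shared by every call.
    kwargs["calls"] = kwargs.get("calls", 0) + 1
    return kwargs


print(leaky())  # {'calls': 1}
print(leaky())  # {'calls': 2} -- state leaked across calls


def safe(kwargs=None):
    if not kwargs:
        kwargs = {}  # fresh dict on every call, as in the diff above
    kwargs["calls"] = kwargs.get("calls", 0) + 1
    return kwargs


print(safe())  # {'calls': 1}
print(safe())  # {'calls': 1}
```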
@@ -677,25 +691,30 @@
 
 
 def train_estimator(
-    config_dic,
+    config_dic: dict,
     X_train,
     y_train,
-    task,
-    estimator_name,
-    n_jobs=1,
-    estimator_class=None,
+    task: str,
+    estimator_name: str,
+    n_jobs: Optional[
+        int
+    ] = 1,  # some estimators of EstimatorSubclass don't accept n_jobs. Should be None in that case.
+    estimator_class: Optional[EstimatorSubclass] = None,
     budget=None,
-    fit_kwargs={},
+    fit_kwargs: Optional[dict] = None,
     eval_metric=None,
     free_mem_ratio=0,
-):
+) -> Tuple[EstimatorSubclass, float]:
     start_time = time.time()
     estimator_class = estimator_class or get_estimator_class(task, estimator_name)
     estimator = estimator_class(
         **config_dic,
         task=task,
         n_jobs=n_jobs,
     )
+    if not fit_kwargs:
+        fit_kwargs = {}
+
     if isinstance(estimator, TransformersEstimator):
         fit_kwargs["metric"] = eval_metric

@@ -717,7 +736,9 @@ def get_classification_objective(num_labels: int) -> str:
     return objective_name
 
 
-def norm_confusion_matrix(y_true, y_pred):
+def norm_confusion_matrix(
+    y_true: Union[np.array, pd.Series], y_pred: Union[np.array, pd.Series]
+):
     """normalized confusion matrix.
 
     Args:
@@ -735,7 +756,11 @@ def norm_confusion_matrix(y_true, y_pred):
     return norm_conf_mat
 
 
-def multi_class_curves(y_true, y_pred_proba, curve_func):
+def multi_class_curves(
+    y_true: Union[np.array, pd.Series],
+    y_pred_proba: Union[np.array, pd.Series],
+    curve_func: Callable,
+):
     """Binarize the data for multi-class tasks and produce ROC or precision-recall curves.
 
     Args:
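A hedged usage sketch for the two helpers annotated above. The data is random, and the return shapes assume what the docstrings describe: a row-normalized confusion matrix, and per-class curve dictionaries keyed by class index:

```python
import numpy as np
from sklearn.metrics import roc_curve
from flaml.automl.ml import norm_confusion_matrix, multi_class_curves

y_true = np.array([0, 1, 2, 1, 0, 2])
y_pred = np.array([0, 2, 2, 1, 0, 1])
print(norm_confusion_matrix(y_true, y_pred))  # rows normalized to sum to 1

# Fake class probabilities for three classes, one row per sample.
y_proba = np.random.dirichlet(np.ones(3), size=6)
curve_x, curve_y = multi_class_curves(y_true, y_proba, roc_curve)
print(curve_x[0], curve_y[0])  # FPR/TPR for class 0, one-vs-rest
```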
