Tabular: HPO bag fix (open-mmlab#912)
* Tabular: Fixed time usage in HPO, reduced memory usage of HPO, improved naming of HPO models.

* minor update

* minor fix
Innixma authored Feb 8, 2021
1 parent 14f8894 commit 6ef7d70
Showing 9 changed files with 66 additions and 51 deletions.
3 changes: 2 additions & 1 deletion core/src/autogluon/core/models/abstract/abstract_model.py
@@ -192,6 +192,7 @@ def _set_default_searchspace(self):
if self.params is not None:
self.params.update(def_search_space)

# TODO: v0.1 Change this to update path_root only, path change to property
def set_contexts(self, path_context):
self.path = self.create_contexts(path_context)
self.path_root = self.path.rsplit(self.path_suffix, 1)[0]
@@ -609,7 +610,7 @@ def _get_hpo_results(self, scheduler, scheduler_params: dict, time_start):
hpo_model_performances = {}
for trial in sorted(hpo_results['trial_info'].keys()):
# TODO: ignore models which were killed early by scheduler (eg. in Hyperband). How to ID these?
file_id = "trial_" + str(trial) # unique identifier to files from this trial
file_id = f"T{trial}" # unique identifier to files from this trial
trial_model_name = self.name + os.path.sep + file_id
trial_model_path = self.path_root + trial_model_name + os.path.sep
hpo_models[trial_model_name] = trial_model_path
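For illustration, a minimal sketch (not part of this diff) of how the shortened trial identifier composes into per-trial model names and paths; the model name and path_root values below are hypothetical:

import os

model_name = "LightGBM"                                    # hypothetical HPO'd model
path_root = "AutogluonModels/ag-20210208/models/"          # hypothetical path_root

for trial in range(3):
    file_id = f"T{trial}"                                  # was: "trial_" + str(trial)
    trial_model_name = model_name + os.path.sep + file_id  # e.g. "LightGBM/T0"
    trial_model_path = path_root + trial_model_name + os.path.sep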
14 changes: 11 additions & 3 deletions core/src/autogluon/core/models/abstract/model_trial.py
@@ -29,8 +29,16 @@ def model_trial(args, reporter: LocalStatusReporter):

fit_model_args = dict(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **util_args.get('fit_kwargs', dict()))
predict_proba_args = dict(X=X_val)
model = fit_and_save_model(model=model, params=args, fit_args=fit_model_args, predict_proba_args=predict_proba_args, y_val=y_val,
time_start=util_args.time_start, time_limit=util_args.get('time_limit', None), reporter=None)
model = fit_and_save_model(
model=model,
params=args,
fit_args=fit_model_args,
predict_proba_args=predict_proba_args,
y_val=y_val,
time_start=util_args.time_start,
time_limit=util_args.get('time_limit', None),
reporter=None,
)
except Exception as e:
if not isinstance(e, TimeLimitExceeded):
logger.exception(e, exc_info=True)
@@ -43,7 +51,7 @@ def prepare_inputs(args):
task_id = args.pop('task_id')
util_args = args.pop('util_args')

file_prefix = f"trial_{task_id}" # append to all file names created during this trial. Do NOT change!
file_prefix = f"T{task_id}" # append to all file names created during this trial. Do NOT change!
model = util_args.model # the model object must be passed into model_trial() here
model.name = model.name + os.path.sep + file_prefix
model.set_contexts(path_context=model.path_root + model.name + os.path.sep)
20 changes: 8 additions & 12 deletions core/src/autogluon/core/models/ensemble/bagged_ensemble_model.py
@@ -125,6 +125,7 @@ def _fit(self, X_train, y_train, k_fold=5, k_fold_start=0, k_fold_end=None, n_re
time_start = time.time()

model_base = self._get_model_base()
model_base.rename(name='')
if self.features is not None:
model_base.features = self.features
model_base.feature_metadata = self.feature_metadata # TODO: Don't pass this here
@@ -136,6 +137,7 @@ def _fit(self, X_train, y_train, k_fold=5, k_fold_start=0, k_fold_end=None, n_re
if k_fold == 1:
if self._n_repeats != 0:
raise ValueError(f'n_repeats must equal 0 when fitting a single model with k_fold < 2, values: ({self._n_repeats}, {k_fold})')
model_base.name = f'{model_base.name}S1F1'
model_base.set_contexts(path_context=self.path + model_base.name + os.path.sep)
time_start_fit = time.time()
model_base.fit(X_train=X_train, y_train=y_train, time_limit=time_limit, **kwargs)
@@ -171,6 +173,7 @@ def _fit(self, X_train, y_train, k_fold=5, k_fold_start=0, k_fold_end=None, n_re
fold_end_n_repeat = min(fold_start_n_repeat + k_fold, fold_end)
# TODO: Consider moving model fit inner for loop to a function to simply this code
for i in range(fold_start_n_repeat, fold_end_n_repeat): # For each fold
fold_num_in_repeat = i - (j * k_fold) # The fold in the current repeat set (first fold in set = 0)
folds_finished = i - fold_start
folds_left = fold_end - i
fold = kfolds[i]
@@ -194,7 +197,7 @@ def _fit(self, X_train, y_train, k_fold=5, k_fold_start=0, k_fold_end=None, n_re
X_train_fold, X_val_fold = X_train.iloc[train_index, :], X_train.iloc[val_index, :]
y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
fold_model = copy.deepcopy(model_base)
fold_model.name = f'{fold_model.name}_F{i+1}'
fold_model.name = f'{fold_model.name}S{j+1}F{fold_num_in_repeat+1}' # S5F3 = 3rd fold of the 5th repeat set
fold_model.set_contexts(self.path + fold_model.name + os.path.sep)
fold_model.fit(X_train=X_train_fold, y_train=y_train_fold, X_val=X_val_fold, y_val=y_val_fold, time_limit=time_limit_fold, **kwargs)
time_train_end_fold = time.time()
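For illustration, a minimal sketch (not part of this diff) of how the new fold names are derived during repeated bagging; because model_base.rename(name='') clears the base name above, each child's name is just the S/F tag itself:

k_fold = 5
for j in range(2):                                  # repeat sets (outer loop, assumed)
    for i in range(j * k_fold, (j + 1) * k_fold):   # fold index across all repeats
        fold_num_in_repeat = i - (j * k_fold)       # first fold in set = 0
        fold_model_name = f"S{j+1}F{fold_num_in_repeat+1}"
        # e.g. i=7, j=1 -> "S2F3" (3rd fold of the 2nd repeat set)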
@@ -350,14 +353,8 @@ def save_child(self, model, verbose=False):
def convert_to_refit_full_template(self):
init_args = self._get_init_args()
init_args['hyperparameters']['save_bag_folds'] = True # refit full models must save folds
model_base_name_orig = init_args['model_base'].name
init_args['model_base'] = self.convert_to_refitfull_template_child()
model_base_name_new = init_args['model_base'].name
if model_base_name_orig in init_args['name'] and model_base_name_orig != model_base_name_new:
init_args['name'] = init_args['name'].replace(model_base_name_orig, model_base_name_new, 1)
else:
init_args['name'] = init_args['name'] + '_FULL'

init_args['name'] = init_args['name'] + REFIT_FULL_SUFFIX
model_full_template = self.__class__(**init_args)
return model_full_template

@@ -366,8 +363,6 @@ def convert_to_refitfull_template_child(self):
child_compressed = copy.deepcopy(self._get_model_base())
child_compressed.feature_metadata = self.feature_metadata # TODO: Don't pass this here
child_compressed.params = compressed_params
child_compressed.name = child_compressed.name + REFIT_FULL_SUFFIX
child_compressed.set_contexts(self.path_root + child_compressed.name + os.path.sep)
return child_compressed

def _get_init_args(self):
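For illustration, a minimal sketch (not part of this diff) of the simplified refit-full naming: the suffix is now applied once to the bagged ensemble's name rather than also being baked into the child template. The bag name below is hypothetical and '_FULL' is the assumed value of the imported REFIT_FULL_SUFFIX constant (the removed code appended '_FULL' directly):

REFIT_FULL_SUFFIX = '_FULL'                      # assumed value of the imported constant
bag_name = "LightGBM_BAG_L1"                     # hypothetical bagged-model name
refit_bag_name = bag_name + REFIT_FULL_SUFFIX    # "LightGBM_BAG_L1_FULL"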
@@ -637,14 +632,15 @@ def _hyperparameter_tune(self, X_train, y_train, k_fold, scheduler_options, prep

# TODO: Create new Ensemble Here
bag = copy.deepcopy(self)
bag.name = bag.name + os.path.sep + str(i)
bag.rename(f"{bag.name}{os.path.sep}T{i}")
bag.set_contexts(self.path_root + bag.name + os.path.sep)

oof_pred_proba, oof_pred_model_repeats = self._construct_empty_oof(X=X_train, y=y_train)
oof_pred_proba[test_index] += y_pred_proba
oof_pred_model_repeats[test_index] += 1

bag.model_base = None
child.rename('')
child.set_contexts(bag.path + child.name + os.path.sep)
bag.save_model_base(child.convert_to_template())

@@ -653,7 +649,7 @@ def _hyperparameter_tune(self, X_train, y_train, k_fold, scheduler_options, prep
bag._n_repeats = 1
bag._oof_pred_proba = oof_pred_proba
bag._oof_pred_model_repeats = oof_pred_model_repeats
child.name = child.name + '_fold_0'
child.rename('S1F1')
child.set_contexts(bag.path + child.name + os.path.sep)
if not self.params.get('save_bag_folds', True):
child.model = None
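For illustration, a minimal sketch (not part of this diff) of the on-disk layout these hunks produce for each HPO trial of a bagged ensemble; the bag name and path_root are hypothetical:

import os

path_root = "AutogluonModels/ag-20210208/models/"   # hypothetical path_root
bag_name = "LightGBM_BAG_L1"                        # hypothetical original bag name

for i in range(2):                                          # two HPO trials
    trial_bag_name = f"{bag_name}{os.path.sep}T{i}"         # e.g. "LightGBM_BAG_L1/T0"
    trial_bag_path = path_root + trial_bag_name + os.path.sep
    child_path = trial_bag_path + "S1F1" + os.path.sep      # single fold fit during HPO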
24 changes: 11 additions & 13 deletions core/src/autogluon/core/utils/plots.py
@@ -37,6 +37,7 @@ def plot_performance_vs_trials(results, output_directory, save_file="Performance
print("Plot of HPO performance saved to file: %s" % outputfile)
plt.show()


def plot_summary_of_models(results, output_directory, save_file='SummaryOfModels.html', plot_title="Models produced during fit()"):
""" Plot dynamic scatterplot summary of each model encountered during fit(), based on the returned Results object.
"""
@@ -71,6 +72,7 @@ def plot_summary_of_models(results, output_directory, save_file='SummaryOfModels
if save_path is not None:
print("Plot summary of models saved to file: %s" % save_file)


def plot_tabular_models(results, output_directory=None, save_file="SummaryOfModels.html", plot_title="Models produced during fit()"):
""" Plot dynamic scatterplot of every single model trained during tabular_prediction.fit()
Args:
@@ -87,19 +89,15 @@ def plot_tabular_models(results, output_directory=None, save_file="SummaryOfMode
hidden_keys.append(model_types)
model_hyperparams = [_formatDict(results['model_hyperparams'][key]) for key in model_names]
datadict = {'performance': val_perfs, 'model': model_names, 'model_type': model_types, 'hyperparameters': model_hyperparams}
hpo_used = results['hyperparameter_tune']
if not hpo_used: # currently, times are only stored without HPO
leaderboard = results['leaderboard'].copy()
leaderboard['fit_time'] = leaderboard['fit_time'].fillna(0)
leaderboard['pred_time_val'] = leaderboard['pred_time_val'].fillna(0)
leaderboard = results['leaderboard'].copy()
leaderboard['fit_time'] = leaderboard['fit_time'].fillna(0)
leaderboard['pred_time_val'] = leaderboard['pred_time_val'].fillna(0)

datadict['inference_latency'] = [leaderboard['pred_time_val'][leaderboard['model'] == m].values[0] for m in model_names]
datadict['training_time'] = [leaderboard['fit_time'][leaderboard['model'] == m].values[0] for m in model_names]
mousover_plot(datadict, attr_x='inference_latency', attr_y='performance', attr_color='model_type',
save_file=save_path, plot_title=plot_title, hidden_keys=hidden_keys)

datadict['inference_latency'] = [leaderboard['pred_time_val'][leaderboard['model'] == m].values[0] for m in model_names]
datadict['training_time'] = [leaderboard['fit_time'][leaderboard['model'] == m].values[0] for m in model_names]
mousover_plot(datadict, attr_x='inference_latency', attr_y='performance', attr_color='model_type',
save_file=save_path, plot_title=plot_title, hidden_keys=hidden_keys)
else:
mousover_plot(datadict, attr_x='model_type', attr_y='performance',
save_file=save_path, plot_title=plot_title, hidden_keys=hidden_keys)

def _formatDict(d):
""" Returns dict as string with HTML new-line tags <br> between key-value pairs. """
@@ -109,6 +107,7 @@ def _formatDict(d):
s += new_s
return s[:-4]


def mousover_plot(datadict, attr_x, attr_y, attr_color=None, attr_size=None, save_file=None, plot_title="",
point_transparency = 0.5, point_size=20, default_color="#2222aa", hidden_keys = []):
""" Produces dynamic scatter plot that can be interacted with by mousing over each point to see its label
@@ -216,4 +215,3 @@ def mousover_plot(datadict, attr_x, attr_y, attr_color=None, attr_size=None, sav
p.add_layout(Legend(items=[LegendItem(label='Size of points based on "'+attr_size + '"')]), 'below')

show(p)

@@ -265,7 +265,6 @@ def _fit(self, X_train, y_train, X_val=None, y_val=None, time_limit=None, num_gp
self.model = init_model
else:
if (init_model_best_score > self.stopping_metric._optimum) or (final_model_best_score > self.stopping_metric._optimum):
logger.warning(f'Warning: Sign differs between AG metric and CatBoost metric variants: {self.stopping_metric.name}, flipping signs.')
init_model_best_score = -init_model_best_score
final_model_best_score = -final_model_best_score

@@ -25,14 +25,27 @@ def lgb_trial(args, reporter):
dataset_val = lgb.Dataset(util_args.directory + util_args.dataset_val_filename)
X_val, y_val = load_pkl.load(util_args.directory + util_args.dataset_val_pkl_filename)

reporter_fit = None # Set reporter_fit to reporter for per-iteration reporting, but will take up MUCH more space (can quickly lead to OOM).

fit_model_args = dict(dataset_train=dataset_train, dataset_val=dataset_val, **util_args.get('fit_kwargs', dict()))
predict_proba_args = dict(X=X_val)
model_trial.fit_and_save_model(model=model, params=args, fit_args=fit_model_args, predict_proba_args=predict_proba_args, y_val=y_val,
time_start=util_args.time_start, time_limit=util_args.get('time_limit', None), reporter=reporter)
model_trial.fit_and_save_model(
model=model,
params=args,
fit_args=fit_model_args,
predict_proba_args=predict_proba_args,
y_val=y_val,
time_start=util_args.time_start,
time_limit=util_args.get('time_limit', None),
reporter=reporter_fit,
)
except Exception as e:
if not isinstance(e, TimeLimitExceeded):
logger.exception(e, exc_info=True)
reporter.terminate()
else:
if reporter_fit is None:
reporter(epoch=1, validation_performance=model.val_score)

# FIXME: If stopping metric and eval metric differ, the previous reported scores will not align as they will be evaluated with stopping_metric, whereas this is evaluated with eval_metric
# This should only impact if the reporter data is used
@@ -316,7 +316,8 @@ def fit_summary(self, verbosity=3):
-------
Dict containing various detailed information. We do not recommend directly printing this dict as it may be very large.
"""
hpo_used = len(self._trainer.hpo_results) > 0
# hpo_used = len(self._trainer.hpo_results) > 0
hpo_used = False # Disabled until a more memory efficient hpo_results object is implemented.
model_types = self._trainer.get_models_attribute_dict(attribute='type')
model_inner_types = self._trainer.get_models_attribute_dict(attribute='type_inner')
model_typenames = {key: model_types[key].__name__ for key in model_types}
@@ -341,13 +342,11 @@ def fit_summary(self, verbosity=3):
'model_pred_times': self._trainer.get_models_attribute_dict('predict_time'),
'num_bag_folds': self._trainer.k_fold,
'max_stack_level': self._trainer.get_max_level(),
'feature_prune': self._trainer.feature_prune,
'hyperparameter_tune': hpo_used,
}
if self.problem_type != REGRESSION:
results['num_classes'] = self._trainer.num_classes
if hpo_used:
results['hpo_results'] = self._trainer.hpo_results
# if hpo_used:
# results['hpo_results'] = self._trainer.hpo_results
# get dict mapping model name to final hyperparameter values for each model:
model_hyperparams = {}
for model_name in self._trainer.get_model_names():
@@ -376,9 +375,9 @@ def fit_summary(self, verbosity=3):
num_stack_str = f" (with {results['max_stack_level']} levels)"
print("Multi-layer stack-ensembling used: %s %s" % (stacking_used, num_stack_str))
hpo_str = ""
if hpo_used and verbosity <= 2:
hpo_str = " (call fit_summary() with verbosity >= 3 to see detailed HPO info)"
print("Hyperparameter-tuning used: %s %s" % (hpo_used, hpo_str))
# if hpo_used and verbosity <= 2:
# hpo_str = " (call fit_summary() with verbosity >= 3 to see detailed HPO info)"
# print("Hyperparameter-tuning used: %s %s" % (hpo_used, hpo_str))
# TODO: uncomment once feature_prune is functional: self._summarize('feature_prune', 'feature-selection used', results)
print("Feature Metadata (Processed):")
print("(raw dtype, special dtypes):")
10 changes: 5 additions & 5 deletions tabular/src/autogluon/tabular/trainer/abstract_trainer.py
@@ -75,8 +75,6 @@ def __init__(self, path: str, problem_type: str, eval_metric=None,
self._model_full_dict_val_score = {} # Dict of FULL model -> normal model validation score in case the normal model had been deleted.
self.reset_paths = False

self.hpo_results = {} # Stores summary of HPO process

self._time_limit = None # Internal float of the total time limit allowed for a given fit call. Used in logging statements.
self._time_train_start = None # Internal timestamp of the time training started for a given fit call. Used in logging statements.

@@ -1029,7 +1027,8 @@ def _train_single_full(self, X_train, y_train, model: AbstractModel, X_unlabeled
del model
model_names_trained = []
else:
self.hpo_results[model.name] = hpo_results
# Commented out because it takes too much space (>>5 GB if run for an hour on a small-medium sized dataset)
# self.hpo_results[model.name] = hpo_results
model_names_trained = []
for model_hpo_name, model_path in hpo_models.items():
model_hpo = self.load_model(model_hpo_name, path=model_path, model_type=type(model))
@@ -1117,18 +1116,19 @@ def _train_multi_initial(self, X_train, y_train, models: List[AbstractModel], k_
hpo_enabled = True
break

hpo_time_ratio = 0.9
if hpo_enabled:
time_split = True
else:
time_split = False
if k_fold == 0:
time_ratio = 0.9 if hpo_enabled else 1
time_ratio = hpo_time_ratio if hpo_enabled else 1
models = self._train_multi_fold(models=models, hyperparameter_tune_kwargs=hyperparameter_tune_kwargs, feature_prune=feature_prune, time_limit=time_limit, time_split=time_split, time_ratio=time_ratio, **fit_args)
else:
k_fold_start = 0
if hpo_enabled or feature_prune:
time_start = time.time()
time_ratio = (1 - (1 / k_fold)) * 0.9
time_ratio = (1 / k_fold) * hpo_time_ratio
models = self._train_multi_fold(models=models, hyperparameter_tune_kwargs=hyperparameter_tune_kwargs, feature_prune=feature_prune,
k_fold_start=0, k_fold_end=1, n_repeats=n_repeats, n_repeat_start=0, time_limit=time_limit, time_split=time_split, time_ratio=time_ratio, **fit_args)
k_fold_start = 1
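For illustration (not part of this diff), the arithmetic behind the time-ratio fix: during HPO only the first of the k folds is fit (k_fold_start=0, k_fold_end=1), so the HPO stage's share of the budget now scales with 1/k_fold rather than 1 - 1/k_fold; the time_limit value below is hypothetical:

hpo_time_ratio = 0.9
k_fold = 8
time_limit = 3600                          # hypothetical seconds allotted to this model

old_ratio = (1 - (1 / k_fold)) * 0.9       # 0.7875 -> 2835 s for a single HPO fold
new_ratio = (1 / k_fold) * hpo_time_ratio  # 0.1125 ->  405 s for a single HPO fold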
13 changes: 7 additions & 6 deletions tabular/src/autogluon/tabular/trainer/model_presets/presets.py
@@ -264,21 +264,22 @@ def model_factory(
name_prefix = model[AG_ARGS].get('name_prefix', '')
name_suff = model[AG_ARGS].get('name_suffix', '')
name_orig = name_prefix + name_main + name_suff
if name_suffix is not None:
name_orig = name_orig + name_suffix
name = name_orig
name_stacker = None
num_increment = 2
if name_suffix is None:
name_suffix = ''
if ensemble_kwargs is None:
name = f'{name_orig}{name_suffix}'
while name in invalid_name_set: # Ensure name is unique
name = f'{name_orig}_{num_increment}'
name = f'{name_orig}_{num_increment}{name_suffix}'
num_increment += 1
else:
name = name_orig
name_bag_suffix = model[AG_ARGS].get('name_bag_suffix', '_BAG')
name_stacker = f'{name}{name_bag_suffix}_L{level}'
name_stacker = f'{name}{name_bag_suffix}_L{level}{name_suffix}'
while name_stacker in invalid_name_set: # Ensure name is unique
name = f'{name_orig}_{num_increment}'
name_stacker = f'{name}{name_bag_suffix}_L{level}'
name_stacker = f'{name}{name_bag_suffix}_L{level}{name_suffix}'
num_increment += 1
model_params = copy.deepcopy(model)
model_params.pop(AG_ARGS, None)
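For illustration, a minimal sketch (not part of this diff) of how the updated model_factory naming resolves collisions while keeping a user-supplied name_suffix at the very end; all names below are hypothetical:

invalid_name_set = {"CustomLGB_custom"}
name_orig, name_suffix, num_increment = "CustomLGB", "_custom", 2

# Non-bagged branch (ensemble_kwargs is None)
name = f"{name_orig}{name_suffix}"                       # "CustomLGB_custom" (taken)
while name in invalid_name_set:
    name = f"{name_orig}_{num_increment}{name_suffix}"   # "CustomLGB_2_custom"
    num_increment += 1

# Bagged/stacked branch: the suffix now also follows the level tag
level, name_bag_suffix = 1, "_BAG"
name_stacker = f"{name_orig}{name_bag_suffix}_L{level}{name_suffix}"  # "CustomLGB_BAG_L1_custom"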
