Auto adapting batch size #2119

Merged 25 commits on May 12, 2023. Changes shown from 22 commits.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -10,6 +10,7 @@ All notable changes to this project will be documented in this file.
 - Action task supports multi GPU training. (<https://github.com/openvinotoolkit/training_extensions/pull/2057>)
 - Support storage cache in Apache Arrow using Datumaro for action tasks (<https://github.com/openvinotoolkit/training_extensions/pull/2087>)
 - Add a simplified greedy labels postprocessing for hierarchical classification (<https://github.com/openvinotoolkit/training_extensions/pull/2064>).
+- Support auto adapting batch size (<https://github.com/openvinotoolkit/training_extensions/pull/2119>)
 
 ### Enhancements
 
11 changes: 9 additions & 2 deletions otx/algorithms/action/adapters/mmaction/task.py
@@ -48,6 +48,7 @@
     MPAConfig,
     update_or_add_custom_hook,
 )
+from otx.algorithms.common.configs.configuration_enums import BatchSizeAdaptType
 from otx.algorithms.common.utils.data import get_dataset
 from otx.algorithms.common.utils.logger import get_logger
 from otx.api.entities.datasets import DatasetEntity
@@ -295,9 +296,15 @@ def _train_model(

         validate = bool(cfg.data.get("val", None))
 
-        if self._hyperparams.learning_parameters.auto_decrease_batch_size:
+        if self._hyperparams.learning_parameters.auto_adapt_batch_size != BatchSizeAdaptType.NONE:
             train_func = partial(train_model, meta=deepcopy(meta), model=deepcopy(model), distributed=False)
-            adapt_batch_size(train_func, cfg, datasets, validate)
+            adapt_batch_size(
+                train_func,
+                cfg,
+                datasets,
+                validate,
+                not_increase=(self._hyperparams.learning_parameters.auto_adapt_batch_size == BatchSizeAdaptType.SAFE),
+            )
 
         train_model(
             model,
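Both task adapters now branch on `BatchSizeAdaptType`, whose definition is not part of this diff. Judging from the `enum_name` and `options` fields added to the configuration YAMLs below, a minimal sketch of what the enum could look like (the plain `Enum` base class and member values are assumptions, not the actual OTX code):

```python
# Hypothetical sketch of otx.algorithms.common.configs.configuration_enums.BatchSizeAdaptType.
# Member names mirror the NONE/SAFE/FULL options in configuration.yaml; the real base
# class in OTX may differ (e.g. a configurable-parameter enum type).
from enum import Enum


class BatchSizeAdaptType(Enum):
    NONE = "None"  # keep the configured batch size untouched
    SAFE = "Safe"  # only decrease the batch size to avoid GPU out-of-memory
    FULL = "Full"  # also search upward to use most of the GPU memory


# Usage as in the task code above: anything other than NONE triggers adaptation,
# and SAFE restricts the search to not_increase=True.
adapt = BatchSizeAdaptType.SAFE
print(adapt != BatchSizeAdaptType.NONE)  # True
```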
20 changes: 13 additions & 7 deletions otx/algorithms/action/configs/classification/configuration.yaml
@@ -198,23 +198,29 @@ learning_parameters:
       type: UI_RULES
     visible_in_ui: true
     warning: This will automatically control the patience and interval when early stopping is enabled.
-  auto_decrease_batch_size:
+  auto_adapt_batch_size:
     affects_outcome_of: TRAINING
-    default_value: false
-    description: Find a proper batch size by training for an iteration with various batch size a few times.
+    default_value: None
+    description: Safe => Prevent GPU out of memory. Full => Find a batch size using most of GPU memory.
     editable: true
+    enum_name: BatchSizeAdaptType
     header: Decrease batch size if current batch size isn't fit to CUDA memory.
-    type: BOOLEAN
+    options:
+      NONE: "None"
+      SAFE: "Safe"
+      FULL: "Full"
+    type: SELECTABLE
     ui_rules:
       action: DISABLE_EDITING
       operator: AND
       rules: []
       type: UI_RULES
+    value: None
     visible_in_ui: true
     warning:
-      Enabling this option could reduce the actual batch size if the current setting results in out-of-memory error.
-      The learning rate also could be adjusted according to the adapted batch size.
-      This process might take some extra computation time to try a few batch size candidates.
+      Enabling this could change the actual batch size depending on the current GPU status.
+      The learning rate may also be adjusted according to the adapted batch size. This process might change
+      model performance and take some extra computation time to try a few batch size candidates.
   type: PARAMETER_GROUP
   visible_in_ui: true
 postprocessing:
20 changes: 13 additions & 7 deletions otx/algorithms/action/configs/detection/configuration.yaml
@@ -198,23 +198,29 @@ learning_parameters:
       type: UI_RULES
     visible_in_ui: true
     warning: This will automatically control the patience and interval when early stopping is enabled.
-  auto_decrease_batch_size:
+  auto_adapt_batch_size:
     affects_outcome_of: TRAINING
-    default_value: false
-    description: Find a proper batch size by training for an iteration with various batch size a few times.
+    default_value: None
+    description: Safe => Prevent GPU out of memory. Full => Find a batch size using most of GPU memory.
     editable: true
+    enum_name: BatchSizeAdaptType
     header: Decrease batch size if current batch size isn't fit to CUDA memory.
-    type: BOOLEAN
+    options:
+      NONE: "None"
+      SAFE: "Safe"
+      FULL: "Full"
+    type: SELECTABLE
     ui_rules:
       action: DISABLE_EDITING
       operator: AND
       rules: []
       type: UI_RULES
+    value: None
     visible_in_ui: true
     warning:
-      Enabling this option could reduce the actual batch size if the current setting results in out-of-memory error.
-      The learning rate also could be adjusted according to the adapted batch size.
-      This process might take some extra computation time to try a few batch size candidates.
+      Enabling this could change the actual batch size depending on the current GPU status.
+      The learning rate may also be adjusted according to the adapted batch size. This process might change
+      model performance and take some extra computation time to try a few batch size candidates.
   type: PARAMETER_GROUP
   visible_in_ui: true
 postprocessing:
11 changes: 9 additions & 2 deletions otx/algorithms/classification/adapters/mmcls/task.py
@@ -54,6 +54,7 @@
     MPAConfig,
     update_or_add_custom_hook,
 )
+from otx.algorithms.common.configs.configuration_enums import BatchSizeAdaptType
 from otx.algorithms.common.configs.training_base import TrainType
 from otx.algorithms.common.tasks.nncf_task import NNCFBaseTask
 from otx.algorithms.common.utils.data import get_dataset
@@ -406,9 +407,15 @@ def _train_model(
             )
         )
 
-        if self._hyperparams.learning_parameters.auto_decrease_batch_size:
+        if self._hyperparams.learning_parameters.auto_adapt_batch_size != BatchSizeAdaptType.NONE:
             train_func = partial(train_model, meta=deepcopy(meta), model=deepcopy(model), distributed=False)
-            adapt_batch_size(train_func, cfg, datasets, isinstance(self, NNCFBaseTask))  # nncf needs eval hooks
+            adapt_batch_size(
+                train_func,
+                cfg,
+                datasets,
+                isinstance(self, NNCFBaseTask),  # nncf needs eval hooks
+                not_increase=(self._hyperparams.learning_parameters.auto_adapt_batch_size == BatchSizeAdaptType.SAFE),
+            )
 
         train_model(
             model,
20 changes: 13 additions & 7 deletions otx/algorithms/classification/configs/configuration.yaml
@@ -235,23 +235,29 @@ learning_parameters:
       type: UI_RULES
     visible_in_ui: true
     warning: null
-  auto_decrease_batch_size:
+  auto_adapt_batch_size:
     affects_outcome_of: TRAINING
-    default_value: false
-    description: Find a proper batch size by training for an iteration with various batch size a few times.
+    default_value: None
+    description: Safe => Prevent GPU out of memory. Full => Find a batch size using most of GPU memory.
     editable: true
+    enum_name: BatchSizeAdaptType
     header: Decrease batch size if current batch size isn't fit to CUDA memory.
-    type: BOOLEAN
+    options:
+      NONE: "None"
+      SAFE: "Safe"
+      FULL: "Full"
+    type: SELECTABLE
     ui_rules:
       action: DISABLE_EDITING
       operator: AND
       rules: []
       type: UI_RULES
+    value: None
     visible_in_ui: true
    warning:
-      Enabling this option could reduce the actual batch size if the current setting results in out-of-memory error.
-      The learning rate also could be adjusted according to the adapted batch size.
-      This process might take some extra computation time to try a few batch size candidates.
+      Enabling this could change the actual batch size depending on the current GPU status.
+      The learning rate may also be adjusted according to the adapted batch size. This process might change
+      model performance and take some extra computation time to try a few batch size candidates.
   type: PARAMETER_GROUP
   visible_in_ui: true
 pot_parameters:
36 changes: 25 additions & 11 deletions otx/algorithms/common/adapters/mmcv/utils/automatic_bs.py
@@ -4,11 +4,12 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from copy import deepcopy
+from math import sqrt
 from typing import Callable, Dict, List
 
 import numpy as np
 
-from otx.algorithms.common.adapters.torch.utils import adapt_batch_size as adapt_torch_model_bs
+from otx.algorithms.common.adapters.torch.utils import BsSearchAlgo
 from otx.algorithms.common.utils.logger import get_logger
 
 logger = get_logger()
@@ -36,7 +37,7 @@ def _set_value_at_dict_in_dict(target: Dict, key_path: str, value):
     target[keys[-1]] = value
 
 
-def adapt_batch_size(train_func: Callable, cfg, datasets: List, validate: bool = False):
+def adapt_batch_size(train_func: Callable, cfg, datasets: List, validate: bool = False, not_increase: bool = True):
     """Decrease batch size if the default batch size doesn't fit on the current GPU device.
 
     This function sets up training for just a single iteration to reduce the time spent adapting.
@@ -49,6 +50,7 @@ def adapt_batch_size(train_func: Callable, cfg, datasets: List, validate: bool = False, not_increase: bool = True):
         meta (Dict): A dict that records some meta information of the training.
         datasets (List): List of datasets.
         validate (bool): Whether to do validation or not.
+        not_increase (bool): Whether to only allow decreasing the batch size from the default value.
     """
 
     def train_func_single_iter(batch_size):
@@ -64,12 +66,18 @@ def train_func_single_iter(batch_size):
         else:
             copied_cfg.runner["max_epochs"] = 1
 
+        # Remove some hooks due to the reasons below
+        # OTXProgressHook => prevent the progress bar from bouncing between 0 and 100 repeatedly
+        # earlystoppinghook => if the eval hook is excluded, this hook raises an error due to the absence of a score history
+        # CustomEvalHook => exclude validation in the classification task
         idx_hooks_to_remove = []
+        hooks_to_remove = ["OTXProgressHook", "earlystoppinghook", "CustomEvalHook"]
         for i, hook in enumerate(copied_cfg.custom_hooks):
             if not validate and hook["type"] == "AdaptiveTrainSchedulingHook":
                 hook["enable_eval_before_run"] = False
-            if hook["type"] == "OTXProgressHook" or "earlystoppinghook" in hook["type"].lower():
-                idx_hooks_to_remove.append(i)
+            for hook_to_remove in hooks_to_remove:
+                if hook_to_remove.lower() in hook["type"].lower():
+                    idx_hooks_to_remove.append(i)
 
         if idx_hooks_to_remove:
             idx_hooks_to_remove.sort()
@@ -86,19 +94,25 @@ def train_func_single_iter(batch_size):

     default_bs = _get_batch_size(cfg)
 
-    available_bs = adapt_torch_model_bs(
+    bs_search_algo = BsSearchAlgo(
         train_func=train_func_single_iter,
-        current_bs=default_bs,
-        trainset_size=len(datasets[0]),
+        default_bs=default_bs,
+        max_bs=len(datasets[0]),
     )
+    if not_increase:
+        new_batch_size = bs_search_algo.auto_decrease_batch_size()
+    else:
+        drop_last = cfg.data.get("train_dataloader", {}).get("drop_last", False)
+        new_batch_size = bs_search_algo.find_big_enough_batch_size(drop_last)
 
-    if default_bs != available_bs:
-        _set_batch_size(cfg, available_bs)
+    if default_bs != new_batch_size:
+        _set_batch_size(cfg, new_batch_size)
         origin_lr = cfg.optimizer.lr
-        cfg.optimizer.lr *= available_bs / default_bs
+        bs_change_ratio = new_batch_size / default_bs
+        cfg.optimizer.lr *= sqrt(bs_change_ratio)  # Use square-root scaling instead of linear scaling
 
         logger.info("Adapting batch size is done.")
-        logger.info(f"Batch size is adapted : {default_bs} -> {available_bs}")
+        logger.info(f"Batch size is adapted : {default_bs} -> {new_batch_size}")
         logger.info(f"learning rate is adapted : {origin_lr} -> {cfg.optimizer.lr}")
     else:
         logger.info("Adapting batch size is done. Current batch size is available.")
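To make the square-root learning-rate scaling above concrete, here is a tiny worked example of the rule applied in isolation (the numbers are illustrative, not from the PR):

```python
from math import sqrt

# Square-root LR scaling as applied in adapt_batch_size: when the batch size
# grows 4x (8 -> 32), the learning rate grows only 2x, which is gentler than
# linear scaling for large batch-size jumps.
default_bs, new_batch_size = 8, 32
lr = 0.01

bs_change_ratio = new_batch_size / default_bs  # 4.0
lr *= sqrt(bs_change_ratio)                    # 0.01 * 2.0 = 0.02
print(f"adapted lr: {lr}")                     # adapted lr: 0.02
```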
4 changes: 2 additions & 2 deletions otx/algorithms/common/adapters/torch/utils/__init__.py
@@ -3,6 +3,6 @@
 # Copyright (C) 2023 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-from .bs_search_algo import adapt_batch_size
+from .bs_search_algo import BsSearchAlgo
 
-__all__ = ["adapt_batch_size"]
+__all__ = ["BsSearchAlgo"]
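`bs_search_algo.py` itself is outside this 22-commit view, so only the `BsSearchAlgo` interface is visible from its callers in automatic_bs.py. A minimal sketch of that interface under stated assumptions: the halving/doubling probe strategy and the OOM detection below are illustrative guesses, not the actual implementation.

```python
from typing import Callable

import torch


class BsSearchAlgo:
    """Sketch of the batch-size search interface used by adapt_batch_size above.

    Assumption: each probe runs train_func once with a candidate batch size and
    treats a CUDA out-of-memory error as "does not fit".
    """

    def __init__(self, train_func: Callable[[int], None], default_bs: int, max_bs: int):
        self._train_func = train_func
        self._default_bs = default_bs
        self._max_bs = max_bs  # upper bound, e.g. the training set size

    def _fits_in_memory(self, batch_size: int) -> bool:
        # Run one short training trial and report whether it avoided CUDA OOM.
        try:
            self._train_func(batch_size)
            return True
        except RuntimeError as err:
            if "CUDA out of memory" in str(err):
                torch.cuda.empty_cache()
                return False
            raise

    def auto_decrease_batch_size(self) -> int:
        # SAFE mode: halve the batch size until a probe fits.
        bs = self._default_bs
        while bs > 1 and not self._fits_in_memory(bs):
            bs //= 2
        return bs

    def find_big_enough_batch_size(self, drop_last: bool = False) -> int:
        # FULL mode: double the batch size while probes keep fitting.
        # drop_last is accepted for signature parity with the caller above;
        # how the real search uses it is not shown in this diff.
        bs = self._default_bs
        while bs * 2 <= self._max_bs and self._fits_in_memory(bs * 2):
            bs *= 2
        return bs
```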