Auto adapting batch size #2119

Merged 25 commits on May 12, 2023. Changes shown from 22 commits.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -10,6 +10,7 @@ All notable changes to this project will be documented in this file.
 - Action task supports multi GPU training. (<https://github.com/openvinotoolkit/training_extensions/pull/2057>)
 - Support storage cache in Apache Arrow using Datumaro for action tasks (<https://github.com/openvinotoolkit/training_extensions/pull/2087>)
 - Add a simplified greedy labels postprocessing for hierarchical classification (<https://github.com/openvinotoolkit/training_extensions/pull/2064>).
+- Support auto adapting batch size (<https://github.com/openvinotoolkit/training_extensions/pull/2119>)
 
 ### Enhancements
 
11 changes: 9 additions & 2 deletions otx/algorithms/action/adapters/mmaction/task.py
@@ -48,6 +48,7 @@
     MPAConfig,
     update_or_add_custom_hook,
 )
+from otx.algorithms.common.configs.configuration_enums import BatchSizeAdaptType
 from otx.algorithms.common.utils.data import get_dataset
 from otx.algorithms.common.utils.logger import get_logger
 from otx.api.entities.datasets import DatasetEntity
@@ -295,9 +296,15 @@ def _train_model(

         validate = bool(cfg.data.get("val", None))
 
-        if self._hyperparams.learning_parameters.auto_decrease_batch_size:
+        if self._hyperparams.learning_parameters.auto_adapt_batch_size != BatchSizeAdaptType.NONE:
             train_func = partial(train_model, meta=deepcopy(meta), model=deepcopy(model), distributed=False)
-            adapt_batch_size(train_func, cfg, datasets, validate)
+            adapt_batch_size(
+                train_func,
+                cfg,
+                datasets,
+                validate,
+                not_increase=(self._hyperparams.learning_parameters.auto_adapt_batch_size == BatchSizeAdaptType.SAFE),
+            )
 
         train_model(
             model,
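Both task adapters now branch on `BatchSizeAdaptType`, whose definition is not part of this diff. Judging from the `enum_name` and `options` fields added to the configuration YAMLs below, a minimal sketch of what the enum could look like (the plain `Enum` base class and member values are assumptions, not the actual OTX code):

```python
# Hypothetical sketch of otx.algorithms.common.configs.configuration_enums.BatchSizeAdaptType.
# Member names mirror the NONE/SAFE/FULL options in configuration.yaml; the real base
# class in OTX may differ (e.g. a configurable-parameter enum type).
from enum import Enum


class BatchSizeAdaptType(Enum):
    NONE = "None"  # keep the configured batch size untouched
    SAFE = "Safe"  # only decrease the batch size to avoid GPU out-of-memory
    FULL = "Full"  # also search upward to use most of the GPU memory


# Usage as in the task code above: anything other than NONE triggers adaptation,
# and SAFE restricts the search to not_increase=True.
adapt = BatchSizeAdaptType.SAFE
print(adapt != BatchSizeAdaptType.NONE)  # True
```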
20 changes: 13 additions & 7 deletions otx/algorithms/action/configs/classification/configuration.yaml
@@ -198,23 +198,29 @@ learning_parameters:
       type: UI_RULES
     visible_in_ui: true
     warning: This will automatically control the patience and interval when early stopping is enabled.
-  auto_decrease_batch_size:
+  auto_adapt_batch_size:
     affects_outcome_of: TRAINING
-    default_value: false
-    description: Find a proper batch size by training for an iteration with various batch size a few times.
+    default_value: None
+    description: Safe => Prevent GPU out of memory. Full => Find a batch size using most of GPU memory.
     editable: true
+    enum_name: BatchSizeAdaptType
     header: Decrease batch size if current batch size isn't fit to CUDA memory.
-    type: BOOLEAN
+    options:
+      NONE: "None"
+      SAFE: "Safe"
+      FULL: "Full"
+    type: SELECTABLE
     ui_rules:
       action: DISABLE_EDITING
       operator: AND
       rules: []
       type: UI_RULES
+    value: None
     visible_in_ui: true
     warning:
-      Enabling this option could reduce the actual batch size if the current setting results in out-of-memory error.
-      The learning rate also could be adjusted according to the adapted batch size.
-      This process might take some extra computation time to try a few batch size candidates.
+      Enabling this could change the actual batch size depending on the current GPU status.
+      The learning rate may also be adjusted according to the adapted batch size. This process might change
+      model performance and take some extra computation time to try a few batch size candidates.
   type: PARAMETER_GROUP
   visible_in_ui: true
 postprocessing:
20 changes: 13 additions & 7 deletions otx/algorithms/action/configs/detection/configuration.yaml
@@ -198,23 +198,29 @@ learning_parameters:
       type: UI_RULES
     visible_in_ui: true
     warning: This will automatically control the patience and interval when early stopping is enabled.
-  auto_decrease_batch_size:
+  auto_adapt_batch_size:
     affects_outcome_of: TRAINING
-    default_value: false
-    description: Find a proper batch size by training for an iteration with various batch size a few times.
+    default_value: None
+    description: Safe => Prevent GPU out of memory. Full => Find a batch size using most of GPU memory.
     editable: true
+    enum_name: BatchSizeAdaptType
     header: Decrease batch size if current batch size isn't fit to CUDA memory.
-    type: BOOLEAN
+    options:
+      NONE: "None"
+      SAFE: "Safe"
+      FULL: "Full"
+    type: SELECTABLE
     ui_rules:
       action: DISABLE_EDITING
       operator: AND
       rules: []
       type: UI_RULES
+    value: None
     visible_in_ui: true
     warning:
-      Enabling this option could reduce the actual batch size if the current setting results in out-of-memory error.
-      The learning rate also could be adjusted according to the adapted batch size.
-      This process might take some extra computation time to try a few batch size candidates.
+      Enabling this could change the actual batch size depending on the current GPU status.
+      The learning rate may also be adjusted according to the adapted batch size. This process might change
+      model performance and take some extra computation time to try a few batch size candidates.
   type: PARAMETER_GROUP
   visible_in_ui: true
 postprocessing:
11 changes: 9 additions & 2 deletions otx/algorithms/classification/adapters/mmcls/task.py
@@ -54,6 +54,7 @@
     MPAConfig,
     update_or_add_custom_hook,
 )
+from otx.algorithms.common.configs.configuration_enums import BatchSizeAdaptType
 from otx.algorithms.common.configs.training_base import TrainType
 from otx.algorithms.common.tasks.nncf_task import NNCFBaseTask
 from otx.algorithms.common.utils.data import get_dataset
@@ -406,9 +407,15 @@ def _train_model(
             )
         )
 
-        if self._hyperparams.learning_parameters.auto_decrease_batch_size:
+        if self._hyperparams.learning_parameters.auto_adapt_batch_size != BatchSizeAdaptType.NONE:
             train_func = partial(train_model, meta=deepcopy(meta), model=deepcopy(model), distributed=False)
-            adapt_batch_size(train_func, cfg, datasets, isinstance(self, NNCFBaseTask))  # nncf needs eval hooks
+            adapt_batch_size(
+                train_func,
+                cfg,
+                datasets,
+                isinstance(self, NNCFBaseTask),  # nncf needs eval hooks
+                not_increase=(self._hyperparams.learning_parameters.auto_adapt_batch_size == BatchSizeAdaptType.SAFE),
+            )
 
         train_model(
             model,
20 changes: 13 additions & 7 deletions otx/algorithms/classification/configs/configuration.yaml
@@ -235,23 +235,29 @@ learning_parameters:
       type: UI_RULES
     visible_in_ui: true
     warning: null
-  auto_decrease_batch_size:
+  auto_adapt_batch_size:
     affects_outcome_of: TRAINING
-    default_value: false
-    description: Find a proper batch size by training for an iteration with various batch size a few times.
+    default_value: None
+    description: Safe => Prevent GPU out of memory. Full => Find a batch size using most of GPU memory.
     editable: true
+    enum_name: BatchSizeAdaptType
     header: Decrease batch size if current batch size isn't fit to CUDA memory.
-    type: BOOLEAN
+    options:
+      NONE: "None"
+      SAFE: "Safe"
+      FULL: "Full"
+    type: SELECTABLE
     ui_rules:
       action: DISABLE_EDITING
       operator: AND
       rules: []
       type: UI_RULES
+    value: None
     visible_in_ui: true
    warning:
-      Enabling this option could reduce the actual batch size if the current setting results in out-of-memory error.
-      The learning rate also could be adjusted according to the adapted batch size.
-      This process might take some extra computation time to try a few batch size candidates.
+      Enabling this could change the actual batch size depending on the current GPU status.
+      The learning rate may also be adjusted according to the adapted batch size. This process might change
+      model performance and take some extra computation time to try a few batch size candidates.
   type: PARAMETER_GROUP
   visible_in_ui: true
 pot_parameters:
36 changes: 25 additions & 11 deletions otx/algorithms/common/adapters/mmcv/utils/automatic_bs.py
@@ -4,11 +4,12 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from copy import deepcopy
+from math import sqrt
 from typing import Callable, Dict, List
 
 import numpy as np
 
-from otx.algorithms.common.adapters.torch.utils import adapt_batch_size as adapt_torch_model_bs
+from otx.algorithms.common.adapters.torch.utils import BsSearchAlgo
 from otx.algorithms.common.utils.logger import get_logger
 
 logger = get_logger()
@@ -36,7 +37,7 @@ def _set_value_at_dict_in_dict(target: Dict, key_path: str, value):
     target[keys[-1]] = value
 
 
-def adapt_batch_size(train_func: Callable, cfg, datasets: List, validate: bool = False):
+def adapt_batch_size(train_func: Callable, cfg, datasets: List, validate: bool = False, not_increase: bool = True):
     """Decrease batch size if the default batch size doesn't fit on the current GPU device.
 
     This function sets up training for just a single iteration to reduce the time spent adapting.
@@ -49,6 +50,7 @@ def adapt_batch_size(train_func: Callable, cfg, datasets: List, validate: bool = False, not_increase: bool = True):
         meta (Dict): A dict that records some meta information of the training.
         datasets (List): List of datasets.
         validate (bool): Whether to do validation or not.
+        not_increase (bool): Whether to only allow decreasing the batch size from the default value.
     """
 
     def train_func_single_iter(batch_size):
@@ -64,12 +66,18 @@ def train_func_single_iter(batch_size):
         else:
             copied_cfg.runner["max_epochs"] = 1
 
+        # Remove some hooks due to the reasons below
+        # OTXProgressHook => prevent the progress bar from bouncing between 0 and 100 repeatedly
+        # earlystoppinghook => if the eval hook is excluded, this hook raises an error due to the absence of a score history
+        # CustomEvalHook => exclude validation in the classification task
         idx_hooks_to_remove = []
+        hooks_to_remove = ["OTXProgressHook", "earlystoppinghook", "CustomEvalHook"]
         for i, hook in enumerate(copied_cfg.custom_hooks):
             if not validate and hook["type"] == "AdaptiveTrainSchedulingHook":
                 hook["enable_eval_before_run"] = False
-            if hook["type"] == "OTXProgressHook" or "earlystoppinghook" in hook["type"].lower():
-                idx_hooks_to_remove.append(i)
+            for hook_to_remove in hooks_to_remove:
+                if hook_to_remove.lower() in hook["type"].lower():
+                    idx_hooks_to_remove.append(i)
 
         if idx_hooks_to_remove:
             idx_hooks_to_remove.sort()
@@ -86,19 +94,25 @@ def train_func_single_iter(batch_size):

     default_bs = _get_batch_size(cfg)
 
-    available_bs = adapt_torch_model_bs(
+    bs_search_algo = BsSearchAlgo(
         train_func=train_func_single_iter,
-        current_bs=default_bs,
-        trainset_size=len(datasets[0]),
+        default_bs=default_bs,
+        max_bs=len(datasets[0]),
     )
+    if not_increase:
+        new_batch_size = bs_search_algo.auto_decrease_batch_size()
+    else:
+        drop_last = cfg.data.get("train_dataloader", {}).get("drop_last", False)
+        new_batch_size = bs_search_algo.find_big_enough_batch_size(drop_last)
 
-    if default_bs != available_bs:
-        _set_batch_size(cfg, available_bs)
+    if default_bs != new_batch_size:
+        _set_batch_size(cfg, new_batch_size)
         origin_lr = cfg.optimizer.lr
-        cfg.optimizer.lr *= available_bs / default_bs
+        bs_change_ratio = new_batch_size / default_bs
+        cfg.optimizer.lr *= sqrt(bs_change_ratio)  # Use square-root scaling instead of linear scaling
 
         logger.info("Adapting batch size is done.")
-        logger.info(f"Batch size is adapted : {default_bs} -> {available_bs}")
+        logger.info(f"Batch size is adapted : {default_bs} -> {new_batch_size}")
         logger.info(f"learning rate is adapted : {origin_lr} -> {cfg.optimizer.lr}")
     else:
         logger.info("Adapting batch size is done. Current batch size is available.")
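To make the square-root learning-rate scaling above concrete, here is a tiny worked example of the rule applied in isolation (the numbers are illustrative, not from the PR):

```python
from math import sqrt

# Square-root LR scaling as applied in adapt_batch_size: when the batch size
# grows 4x (8 -> 32), the learning rate grows only 2x, which is gentler than
# linear scaling for large batch-size jumps.
default_bs, new_batch_size = 8, 32
lr = 0.01

bs_change_ratio = new_batch_size / default_bs  # 4.0
lr *= sqrt(bs_change_ratio)                    # 0.01 * 2.0 = 0.02
print(f"adapted lr: {lr}")                     # adapted lr: 0.02
```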
4 changes: 2 additions & 2 deletions otx/algorithms/common/adapters/torch/utils/__init__.py
@@ -3,6 +3,6 @@
 # Copyright (C) 2023 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-from .bs_search_algo import adapt_batch_size
+from .bs_search_algo import BsSearchAlgo
 
-__all__ = ["adapt_batch_size"]
+__all__ = ["BsSearchAlgo"]
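`bs_search_algo.py` itself is outside this 22-commit view, so only the `BsSearchAlgo` interface is visible from its callers in automatic_bs.py. A minimal sketch of that interface under stated assumptions: the halving/doubling probe strategy and the OOM detection below are illustrative guesses, not the actual implementation.

```python
from typing import Callable

import torch


class BsSearchAlgo:
    """Sketch of the batch-size search interface used by adapt_batch_size above.

    Assumption: each probe runs train_func once with a candidate batch size and
    treats a CUDA out-of-memory error as "does not fit".
    """

    def __init__(self, train_func: Callable[[int], None], default_bs: int, max_bs: int):
        self._train_func = train_func
        self._default_bs = default_bs
        self._max_bs = max_bs  # upper bound, e.g. the training set size

    def _fits_in_memory(self, batch_size: int) -> bool:
        # Run one short training trial and report whether it avoided CUDA OOM.
        try:
            self._train_func(batch_size)
            return True
        except RuntimeError as err:
            if "CUDA out of memory" in str(err):
                torch.cuda.empty_cache()
                return False
            raise

    def auto_decrease_batch_size(self) -> int:
        # SAFE mode: halve the batch size until a probe fits.
        bs = self._default_bs
        while bs > 1 and not self._fits_in_memory(bs):
            bs //= 2
        return bs

    def find_big_enough_batch_size(self, drop_last: bool = False) -> int:
        # FULL mode: double the batch size while probes keep fitting.
        # drop_last is accepted for signature parity with the caller above;
        # how the real search uses it is not shown in this diff.
        bs = self._default_bs
        while bs * 2 <= self._max_bs and self._fits_in_memory(bs * 2):
            bs *= 2
        return bs
```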