Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Enhancement] Support export logs of different ranks in debug mode #968

Merged
merged 19 commits into from
Mar 13, 2023
Merged
57 changes: 31 additions & 26 deletions docs/en/design/logging.md
Original file line number Diff line number Diff line change
Expand Up @@ -402,49 +402,54 @@ Since distributed applications will create multiple log files, we add a director

### Export logs in distributed training

When training with pytorch distributed methods, users can set `distributed=True` in config file to export multiple logs from all processes. If not specified, only master process will export log file.
When training with PyTorch distributed methods, users can set `distributed=True` or `log_level='DEBUG'` in the config file to export logs from all processes. If neither is specified, only the master process will export a log file.

```python
logger = MMLogger.get_instance('mmengine', log_file='tmp.log', distributed=True, log_level='INFO')
# or
# logger = MMLogger.get_instance('mmengine', log_file='tmp.log', log_level='DEBUG')
```

In the case of multiple processes in a single node, or multiple processes in multiple nodes with shared storage, the exported log files have the following hierarchy

```text
# shared storage case
./tmp
├── tmp.log
├── tmp_rank1.log
├── tmp_rank2.log
├── tmp_rank3.log
├── tmp_rank4.log
├── tmp_rank5.log
├── tmp_rank6.log
── tmp_rank7.log
work_dir/20230228_141908
├── 20230306_183634_${hostname}_device0_rank0.log
├── 20230306_183634_${hostname}_device1_rank1.log
├── 20230306_183634_${hostname}_device2_rank2.log
├── 20230306_183634_${hostname}_device3_rank3.log
├── 20230306_183634_${hostname}_device4_rank4.log
├── 20230306_183634_${hostname}_device5_rank5.log
├── 20230306_183634_${hostname}_device6_rank6.log
├── 20230306_183634_${hostname}_device7_rank7.log
...
── tmp_rank63.log
└── 20230306_183634_${hostname}_device7_rank63.log
```

In the case of multiple processes in multiple nodes without shared storage, logs are organized as follows

```text
# without shared storage
# node 0:
work_dir/
└── exp_name_logs
├── exp_name.log
├── exp_name_rank1.log
├── exp_name_rank2.log
├── exp_name_rank3.log
...
└── exp_name_rank7.log
work_dir/20230228_141908
├── 20230306_183634_${hostname}_device0_rank0.log
├── 20230306_183634_${hostname}_device1_rank1.log
├── 20230306_183634_${hostname}_device2_rank2.log
├── 20230306_183634_${hostname}_device3_rank3.log
├── 20230306_183634_${hostname}_device4_rank4.log
├── 20230306_183634_${hostname}_device5_rank5.log
├── 20230306_183634_${hostname}_device6_rank6.log
└── 20230306_183634_${hostname}_device7_rank7.log

# node 7:
work_dir/
└── exp_name_logs
├── exp_name_rank56.log
├── exp_name_rank57.log
├── exp_name_rank58.log
...
└── exp_name_rank63.log
work_dir/20230228_141908
├── 20230306_183634_${hostname}_device0_rank56.log
├── 20230306_183634_${hostname}_device1_rank57.log
├── 20230306_183634_${hostname}_device2_rank58.log
├── 20230306_183634_${hostname}_device3_rank59.log
├── 20230306_183634_${hostname}_device4_rank60.log
├── 20230306_183634_${hostname}_device5_rank61.log
├── 20230306_183634_${hostname}_device6_rank62.log
└── 20230306_183634_${hostname}_device7_rank63.log
```
53 changes: 28 additions & 25 deletions docs/zh_cn/design/logging.md
Original file line number Diff line number Diff line change
Expand Up @@ -412,39 +412,42 @@ logger = MMLogger.get_instance('mmengine', log_file='tmp.log', distributed=True,

```text
# 共享存储
./tmp
├── tmp.log
├── tmp_rank1.log
├── tmp_rank2.log
├── tmp_rank3.log
├── tmp_rank4.log
├── tmp_rank5.log
├── tmp_rank6.log
── tmp_rank7.log
work_dir/20230228_141908
├── 20230306_183634_${hostname}_device0_rank0.log
├── 20230306_183634_${hostname}_device1_rank1.log
├── 20230306_183634_${hostname}_device2_rank2.log
├── 20230306_183634_${hostname}_device3_rank3.log
├── 20230306_183634_${hostname}_device4_rank4.log
├── 20230306_183634_${hostname}_device5_rank5.log
├── 20230306_183634_${hostname}_device6_rank6.log
├── 20230306_183634_${hostname}_device7_rank7.log
...
── tmp_rank63.log
└── 20230306_183634_${hostname}_device7_rank63.log
```

多机多卡,独立存储的情况:

```text
# 独立存储
# 设备0:
work_dir/
└── exp_name_logs
├── exp_name.log
├── exp_name_rank1.log
├── exp_name_rank2.log
├── exp_name_rank3.log
...
└── exp_name_rank7.log
work_dir/20230228_141908
├── 20230306_183634_${hostname}_device0_rank0.log
├── 20230306_183634_${hostname}_device1_rank1.log
├── 20230306_183634_${hostname}_device2_rank2.log
├── 20230306_183634_${hostname}_device3_rank3.log
├── 20230306_183634_${hostname}_device4_rank4.log
├── 20230306_183634_${hostname}_device5_rank5.log
├── 20230306_183634_${hostname}_device6_rank6.log
└── 20230306_183634_${hostname}_device7_rank7.log

# 设备7:
work_dir/
└── exp_name_logs
├── exp_name_rank56.log
├── exp_name_rank57.log
├── exp_name_rank58.log
...
└── exp_name_rank63.log
work_dir/20230228_141908
├── 20230306_183634_${hostname}_device0_rank56.log
├── 20230306_183634_${hostname}_device1_rank57.log
├── 20230306_183634_${hostname}_device2_rank58.log
├── 20230306_183634_${hostname}_device3_rank59.log
├── 20230306_183634_${hostname}_device4_rank60.log
├── 20230306_183634_${hostname}_device5_rank61.log
├── 20230306_183634_${hostname}_device6_rank62.log
└── 20230306_183634_${hostname}_device7_rank63.log
```
96 changes: 80 additions & 16 deletions mmengine/logging/logger.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import os
import os.path as osp
import sys
import warnings
from getpass import getuser
from logging import Logger, LogRecord
from socket import gethostname
from typing import Optional, Union

from termcolor import colored
Expand Down Expand Up @@ -134,8 +138,9 @@ class MMLogger(Logger, ManagerMixin):
If `logger_name` is not defined, defaults to 'mmengine'.
log_file (str, optional): The log filename. If specified, a
``FileHandler`` will be added to the logger. Defaults to None.
log_level (str): The log level of the handler and logger. Defaults to
"NOTSET".
log_level (str): The log level of the handler. Defaults to
'INFO'. If log level is 'DEBUG', distributed logs will be saved
during distributed training.
file_mode (str): The file mode used to open log file. Defaults to 'w'.
distributed (bool): Whether to save distributed logs, Defaults to
false.
Expand All @@ -145,14 +150,16 @@ def __init__(self,
name: str,
logger_name='mmengine',
log_file: Optional[str] = None,
log_level: str = 'INFO',
log_level: Union[int, str] = 'INFO',
file_mode: str = 'w',
distributed=False):
Logger.__init__(self, logger_name)
ManagerMixin.__init__(self, name)
# Get rank in DDP mode.

rank = _get_rank()
if isinstance(log_level, str):
log_level = logging._nameToLevel[log_level]
global_rank = _get_rank()
device_id = _get_device_id()

# Config stream_handler. If `rank != 0`. stream_handler can only
# export ERROR logs.
Expand All @@ -162,24 +169,30 @@ def __init__(self,
stream_handler.setFormatter(
MMFormatter(color=True, datefmt='%m/%d %H:%M:%S'))
# Only rank0 `StreamHandler` will log messages below error level.
stream_handler.setLevel(log_level) if rank == 0 else \
if global_rank == 0:
stream_handler.setLevel(log_level)
else:
stream_handler.setLevel(logging.ERROR)
self.handlers.append(stream_handler)

if log_file is not None:
if rank != 0:
# rename `log_file` with rank suffix.
path_split = log_file.split(os.sep)
if '.' in path_split[-1]:
filename_list = path_split[-1].split('.')
filename_list[-2] = f'{filename_list[-2]}_rank{rank}'
path_split[-1] = '.'.join(filename_list)
world_size = _get_world_size()
is_distributed = (log_level <= logging.DEBUG
or distributed) and world_size > 1
if is_distributed:
filename, suffix = osp.splitext(osp.basename(log_file))
hostname = _get_host_info()
if hostname:
filename = (f'{filename}_{hostname}_device{device_id}_'
f'rank{global_rank}{suffix}')
else:
path_split[-1] = f'{path_split[-1]}_rank{rank}'
log_file = os.sep.join(path_split)
# Omit hostname if it is empty
filename = (f'{filename}_device{device_id}_'
f'rank{global_rank}{suffix}')
log_file = osp.join(osp.dirname(log_file), filename)
# Save multi-ranks logs if distributed is True. The logs of rank0
# will always be saved.
if rank == 0 or distributed:
if global_rank == 0 or is_distributed:
# Here, the default behaviour of the official logger is 'a'.
# Thus, we provide an interface to change the file mode to
# the default behaviour. `FileHandler` is not supported to
Expand All @@ -192,6 +205,11 @@ def __init__(self,
MMFormatter(color=False, datefmt='%Y/%m/%d %H:%M:%S'))
file_handler.setLevel(log_level)
self.handlers.append(file_handler)
self._log_file = log_file

@property
def log_file(self):
    """Return the log file path recorded on this logger as ``_log_file``."""
    return self._log_file

@classmethod
def get_current_instance(cls) -> 'MMLogger':
Expand Down Expand Up @@ -288,6 +306,17 @@ def print_log(msg,
f'"silent", "current" or None, but got {type(logger)}')


def _get_world_size():
    """Return the distributed world size, or 1 when torch is unavailable.

    The import is deferred to call time so that the logging module stays
    usable in environments where ``mmengine.dist`` (which requires torch)
    cannot be imported.
    """
    try:
        # requires torch
        from mmengine.dist import get_world_size as _dist_world_size
    except ImportError:
        # No torch: behave like a single-process run.
        return 1
    return _dist_world_size()


def _get_rank():
"""Support using logging module without torch."""
try:
Expand All @@ -297,3 +326,38 @@ def _get_rank():
return 0
else:
return get_rank()


def _get_device_id():
    """Get device id of current machine.

    The id is derived from the ``LOCAL_RANK`` environment variable and,
    when CUDA is available, mapped through ``CUDA_VISIBLE_DEVICES`` so the
    physical device index is reported instead of the process-local one.

    Returns:
        int or str: ``0`` if torch cannot be imported; the local rank if
        CUDA is unavailable or the local rank has no matching entry in the
        visible device list; otherwise the visible device entry, as an int
        when it is numeric or as the raw string (e.g. a GPU/MIG UUID) when
        it is not.
    """
    try:
        import torch
    except ImportError:
        return 0

    local_rank = int(os.getenv('LOCAL_RANK', '0'))
    # TODO: return device id of npu and mlu.
    if not torch.cuda.is_available():
        return local_rank

    cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
    if cuda_visible_devices is None:
        visible_devices = list(range(torch.cuda.device_count()))
    else:
        visible_devices = cuda_visible_devices.split(',')

    try:
        device = visible_devices[local_rank]
    except IndexError:
        # Some launchers expose a single device per process, so the list
        # can be shorter than LOCAL_RANK. The id is only used for naming,
        # so fall back to the local rank instead of crashing the logger.
        return local_rank

    try:
        return int(device)
    except ValueError:
        # CUDA_VISIBLE_DEVICES may contain GPU / MIG UUIDs rather than
        # integer indices; report the identifier as-is.
        return device.strip()


def _get_host_info() -> str:
    """Get username and hostname formatted as ``user@host``.

    Returns:
        str: ``'{user}@{hostname}'``, or an empty string if either lookup
        raises, e.g. ``getpass.getuser()`` will lead to error in docker
        container. A warning is emitted in that case.
    """
    # Return from the try/except branches rather than from ``finally``: a
    # ``return`` inside ``finally`` would also discard KeyboardInterrupt and
    # SystemExit raised during the lookup.
    try:
        return f'{getuser()}@{gethostname()}'
    except Exception as e:
        warnings.warn(f'Host or user not found: {str(e)}')
        return ''
Loading