Skip to content

Commit

Permalink
feat(ckpt): add auto ckpt load and singal quit
Browse files Browse the repository at this point in the history
  • Loading branch information
wangguoteng.p authored and SolenoidWGT committed Aug 22, 2023
1 parent 29d27a6 commit 29e90eb
Show file tree
Hide file tree
Showing 5 changed files with 434 additions and 193 deletions.
123 changes: 75 additions & 48 deletions internlm/initialize/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,66 +108,93 @@ def args_sanity_check():
logger.info(f"valid_every: {data.valid_every}")

# processing the checkpoint config
if "checkpoint_every" not in gpc.config.ckpt or gpc.config.ckpt.checkpoint_every <= 0:
gpc.config.ckpt._add_item("checkpoint_every", float("inf"))

if "load_optimizer" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("load_optimizer", True)

if "save_ckpt_folder" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("save_ckpt_folder", None)

if "load_ckpt_folder" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("load_ckpt_folder", None)

if "load_model_only_folder" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("load_model_only_folder", None)

if "async_upload" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("async_upload", False)
ckpt = gpc.config.ckpt
if "enable_save_ckpt" not in ckpt:
ckpt._add_item("enable_save_ckpt", False)

# Saving checkpoint args.
if ckpt.enable_save_ckpt:
assert "checkpoint_every" in ckpt, "If enable save checkpoint, must give checkpoint_every in config.data!"
assert ckpt.checkpoint_every > 0
assert "save_ckpt_folder" in ckpt, "If enable save checkpoint, must give save_ckpt_folder in config.data!"

if "async_upload" not in ckpt:
ckpt._add_item("async_upload", False) # async defalut is False.
else:
if ckpt.async_upload:
assert "save_ckpt_folder" in ckpt
if "boto3:" not in ckpt.save_ckpt_folder:
if gpc.is_rank_for_log():
logger.warning(
"Storing ckpt on file system does not support asynchronous storage, will use sync save!"
)
ckpt.async_upload = False
else:
if "async_upload_tmp_folder" not in ckpt:
ckpt._add_item("async_upload_tmp_folder", "/dev/shm/internlm_tmp_ckpt/")

if "snapshot_ckpt_folder" not in ckpt:
ckpt._add_item("snapshot_ckpt_folder", os.path.join(ckpt.save_ckpt_folder, "snapshot"))

if "oss_snapshot_freq" not in ckpt:
ckpt._add_item("oss_snapshot_freq", float("inf")) # if oss_snapshot_freq not given, we disable.
else:
if gpc.config.ckpt.async_upload:
assert "save_ckpt_folder" in gpc.config.ckpt
if "boto3:" not in gpc.config.ckpt.save_ckpt_folder:
if gpc.is_rank_for_log():
logger.warning(
"Storing ckpt on file system does not support asynchronous storage, will use sync save!"
)
gpc.config.ckpt.async_upload = False
else:
if "async_upload_tmp_folder" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("async_upload_tmp_folder", "/dev/shm/internlm_tmp_ckpt/")

if "snapshot_ckpt_folder" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("snapshot_ckpt_folder", os.path.join(gpc.config.ckpt.save_ckpt_folder), "snapshot")

if "oss_snapshot_freq" not in gpc.config.ckpt and gpc.config.ckpt.checkpoint_every != float("inf"):
gpc.config.ckpt._add_item("oss_snapshot_freq", gpc.config.ckpt.checkpoint_every / 2)
assert gpc.config.ckpt.oss_snapshot_freq > 0

assert not (
gpc.config.ckpt.load_ckpt_folder is not None and gpc.config.ckpt.load_model_only_folder is not None
), "'load_ckpt_folder' and 'load_model_only_folder' cannot be set at the same time."

if "enable_save_ckpt" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("enable_save_ckpt", False)
ckpt._add_item("checkpoint_every", float("inf"))
ckpt._add_item("oss_snapshot_freq", float("inf"))
ckpt._add_item("save_ckpt_folder", None)
ckpt._add_item("async_upload", False)
ckpt._add_item("async_upload_tmp_folder", None)
ckpt._add_item("snapshot_ckpt_folder", None)
ckpt._add_item("snapshot_ckpt_folder", None)

# Loading checkpoint args.
if "load_model_only_folder" not in ckpt:
ckpt._add_item("load_model_only_folder", None)

if "load_ckpt_folder" not in ckpt:
ckpt._add_item("load_ckpt_folder", None)

if "load_optimizer" not in ckpt:
ckpt._add_item("load_optimizer", True)

if "stop_file_path" not in ckpt:
ckpt._add_item("stop_file_path", None)

if "load_given_ckpt" not in ckpt:
# If 'load_given_ckpt' is not given, we set it to False, so internlm can have opportunity
# to auto-load latest checkpoint.
ckpt._add_item("load_given_ckpt", False)

if ckpt.load_given_ckpt:
# Priority: load_given_ckpt(True) > latest_checkpoint > load_model_only_folder
if ckpt.load_ckpt_folder and ckpt.load_model_only_folder:
logger.warning(
"Detect 'load_ckpt_folder' and 'load_model_only_folder' set at the same time, \
and 'load_given_ckpt' is True, so internlm will load from 'load_ckpt_folder'"
)
ckpt.load_model_only_folder = None

if gpc.is_rank_for_log():
logger.info("+" * 15 + " Ckpt Info " + "+" * 15) # pylint: disable=W1201
logger.info(f"is enable save ckpt: {gpc.config.ckpt.enable_save_ckpt}")
logger.info(f"save_ckpt_folder: {gpc.config.ckpt.save_ckpt_folder}")
logger.info(f"checkpoint_every: {gpc.config.ckpt.checkpoint_every}")
logger.info(f"is enable save ckpt: {ckpt.enable_save_ckpt}")
logger.info(f"save_ckpt_folder: {ckpt.save_ckpt_folder}")
logger.info(f"checkpoint_every: {ckpt.checkpoint_every}")
logger.info(f"load_given_ckpt: {ckpt.load_given_ckpt}")

# initialization storage manager
init_storage_manager(gpc.config.ckpt)
init_storage_manager(ckpt)

# tensorboard writer config
if "enable_tb" not in gpc.config:
gpc.config._add_item("enable_tb", True)
if "tensorboard_folder" not in gpc.config:
gpc.config._add_item("tensorboard_folder", None)
gpc.config._add_item(
"tensorboard_folder", os.environ["tensorboard_folder"] if "tensorboard_folder" in os.environ else None
)
if "resume_tb_folder" not in gpc.config:
gpc.config._add_item("resume_tb_folder", None)
gpc.config._add_item(
"resume_tb_folder", os.environ["resume_tb_folder"] if "resume_tb_folder" in os.environ else None
)

# cudnn
torch.backends.cudnn.benchmark = gpc.config.get("cudnn_benchmark", False)
Expand Down
Loading

0 comments on commit 29e90eb

Please sign in to comment.