add ernie pipeline (#858)
ForFishes authored Nov 1, 2022
1 parent f9c2ba9 commit 5753377
Showing 16 changed files with 248 additions and 94 deletions.
10 changes: 7 additions & 3 deletions ppfleetx/configs/nlp/ernie/pretrain_ernie_base.yaml
@@ -69,9 +69,11 @@ Data:
shuffle: False
drop_last: True
loader:
- num_workers: 1
+ num_workers: 0
return_list: False
- collate_fn: ernie_collate_data
+ collate_fn:
+   name: ErnieCollateData
+   micro_batch_size:

Eval:
dataset:
@@ -96,7 +98,9 @@ Data:
loader:
num_workers: 1
return_list: False
- collate_fn: ernie_collate_data
+ collate_fn:
+   name: ErnieCollateData
+   micro_batch_size: 1

Optimizer:
name: FusedAdamW
44 changes: 44 additions & 0 deletions ppfleetx/configs/nlp/ernie/pretrain_ernie_base_175B_mp8_pp16.yaml
@@ -0,0 +1,44 @@
_base_: ./pretrain_ernie_base.yaml

Global:
  global_batch_size:
  local_batch_size: 512
  micro_batch_size: 1


Model:
  vocab_size: 40000
  hidden_size: 12288
  num_hidden_layers: 96
  num_attention_heads: 96
  intermediate_size:
  hidden_act: "gelu"
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 512
  type_vocab_size: 4
  initializer_range: 0.02
  pad_token_id: 0
  task_type_vocab_size: 3
  task_id: 0
  use_task_id: True
  use_recompute: True


Data:
  Train:
    dataset:
      tokenizer_type: ernie-1.0-base-zh-cw
  Eval:
    dataset:
      tokenizer_type: ernie-1.0-base-zh-cw


Distributed:
  dp_degree: 1
  mp_degree: 8
  pp_degree: 16
  sharding:
    sharding_degree: 1
    sharding_stage: 1
    sharding_offload: False
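
As a rough sanity check on the "175B" in the file name, the standard 12·h²·L transformer approximation applied to the numbers above lands close to that figure. This is a generic back-of-the-envelope estimate (it assumes a 4·h feed-forward width, since intermediate_size is left empty, and ignores biases, layer norms, and the type/task embeddings); it is not something ppfleetx computes:

# Back-of-the-envelope parameter count for the 175B config above (illustrative only).
hidden_size = 12288
num_hidden_layers = 96
vocab_size = 40000

# ~12 * h^2 per layer: 4*h^2 for attention projections + 8*h^2 for a 4*h feed-forward block.
block_params = 12 * hidden_size ** 2 * num_hidden_layers
embedding_params = vocab_size * hidden_size

total = block_params + embedding_params
print(f"{total / 1e9:.1f}B parameters")  # ~174.4B, in line with the 175B in the file name
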
@@ -8,9 +8,9 @@ Global:

Model:
vocab_size: 40000
- hidden_size: 768
- num_hidden_layers: 1
- num_attention_heads: 12
+ hidden_size: 1024
+ num_hidden_layers: 24
+ num_attention_heads: 16
intermediate_size:
hidden_act: "gelu"
hidden_dropout_prob: 0.1
@@ -24,6 +24,7 @@ Model:
use_task_id: True
use_recompute: False


Data:
Train:
dataset:
10 changes: 5 additions & 5 deletions ppfleetx/configs/nlp/ernie/pretrain_ernie_base_3D.yaml
@@ -2,8 +2,8 @@ _base_: ./pretrain_ernie_base.yaml

Global:
global_batch_size:
- local_batch_size: 4
- micro_batch_size: 4
+ local_batch_size: 8
+ micro_batch_size: 1


Model:
@@ -34,9 +34,9 @@ Data:


Distributed:
- dp_degree: 1
- mp_degree: 8
- pp_degree: 1
+ dp_degree: 2
+ mp_degree: 2
+ pp_degree: 2
sharding:
sharding_degree: 1
sharding_stage: 1
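
For the batch sizes in this small 3D-parallel config, the implied setup works out as sketched below. The arithmetic follows the usual local/micro/global batch relationships; treating global_batch_size as dp_degree × local_batch_size is an assumption about how the empty field gets filled in, not something stated in this diff:

# Illustrative arithmetic for the 3D-parallel debug config above (not ppfleetx code).
dp_degree, mp_degree, pp_degree = 2, 2, 2
local_batch_size, micro_batch_size = 8, 1

devices_needed = dp_degree * mp_degree * pp_degree        # 8 GPUs in total
accumulate_steps = local_batch_size // micro_batch_size   # 8 micro batches fed through the pipeline
global_batch_size = dp_degree * local_batch_size          # 16 samples per optimizer step (assumed)

print(devices_needed, accumulate_steps, global_batch_size)  # 8 8 16
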
@@ -0,0 +1,43 @@
_base_: ./pretrain_ernie_base.yaml

Global:
  global_batch_size:
  local_batch_size: 512
  micro_batch_size: 1


Model:
  vocab_size: 40000
  hidden_size: 4096
  num_hidden_layers: 32
  num_attention_heads: 32
  intermediate_size:
  hidden_act: "gelu"
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 512
  type_vocab_size: 4
  initializer_range: 0.02
  pad_token_id: 0
  task_type_vocab_size: 3
  task_id: 0
  use_task_id: True
  use_recompute: True

Data:
  Train:
    dataset:
      tokenizer_type: ernie-1.0-base-zh-cw
  Eval:
    dataset:
      tokenizer_type: ernie-1.0-base-zh-cw


Distributed:
  dp_degree: 1
  mp_degree: 8
  pp_degree: 16
  sharding:
    sharding_degree: 1
    sharding_stage: 1
    sharding_offload: False
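
The Distributed block above maps onto hardware roughly as sketched below; the even split of layers across pipeline stages is an assumption made for illustration, not a guarantee from this diff:

# Illustrative device mapping for the config above (assumes an even layer split; not ppfleetx code).
dp_degree, mp_degree, pp_degree = 1, 8, 16
num_hidden_layers = 32

devices_needed = dp_degree * mp_degree * pp_degree   # 128 GPUs for a single data-parallel replica
layers_per_stage = num_hidden_layers // pp_degree    # 2 transformer layers per pipeline stage

print(devices_needed, layers_per_stage)  # 128 2
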
13 changes: 10 additions & 3 deletions ppfleetx/data/__init__.py
@@ -80,9 +80,16 @@ def build_dataloader(config, mode):
    if 'loader' in config[mode].keys():
        config_loader = config[mode].loader
        config_loader = copy.deepcopy(config_loader)
-       collate_fn_name = config_loader.pop('collate_fn', None)
-       collate_fn = getattr(
-           utils, collate_fn_name) if collate_fn_name is not None else None

+       collate_fn_cfg = config_loader.pop('collate_fn', None)
+       if isinstance(collate_fn_cfg, str):
+           collate_fn = getattr(
+               utils, collate_fn_cfg) if collate_fn_cfg is not None else None
+       elif isinstance(collate_fn_cfg, dict):
+           collate_fn_class_name = collate_fn_cfg.pop("name")
+           collate_fn = eval("utils.{}".format(collate_fn_class_name))(
+               **collate_fn_cfg)
+       logger.debug("build collate_fn({}) success...".format(collate_fn))

    def worker_init_fn(worker_id):
        """ set seed in subproces for dataloader when num_workers > 0"""
6 changes: 4 additions & 2 deletions ppfleetx/data/dataset/ernie/dataset_utils.py
@@ -336,8 +336,10 @@ def create_masked_lm_predictions(tokens,
        return (output_tokens, masked_lm_positions, masked_lm_labels,
                token_boundary)

-   num_to_predict = min(max_predictions_per_seq,
-                        max(1, int(round(len(tokens) * masked_lm_prob))))
+   # NOTE(shenliang03): to avoid num_to_predict < 1
+   num_to_predict = max(1,
+                        min(max_predictions_per_seq,
+                            max(1, int(round(len(tokens) * masked_lm_prob)))))

    ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64)
    if not geometric_dist:
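
A quick worked example of what the extra outer max(1, ...) buys; the numbers are made up for illustration. The pre-existing inner max(1, ...) already covers very short sequences, so the new clamp mainly matters when max_predictions_per_seq is configured to 0:

# Made-up values to show the before/after behaviour of the clamp.
tokens = ["[CLS]", "a", "b", "[SEP]"]   # 4 tokens
masked_lm_prob = 0.15                   # round(4 * 0.15) == 1 candidate position
max_predictions_per_seq = 0             # degenerate setting

old = min(max_predictions_per_seq,
          max(1, int(round(len(tokens) * masked_lm_prob))))          # -> 0: nothing would be masked
new = max(1, min(max_predictions_per_seq,
                 max(1, int(round(len(tokens) * masked_lm_prob)))))  # -> 1: at least one prediction

print(old, new)  # 0 1
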
1 change: 0 additions & 1 deletion ppfleetx/data/dataset/ernie/ernie_dataset.py
@@ -284,7 +284,6 @@ def get_train_data_file(input_dir):
        if (os.path.isfile(os.path.join(input_dir, f)) and "_idx.npz" in
            str(f))
    ]
-   # print(">>>> files", files)
    files = [x.replace("_idx.npz", "") for x in files]

    if len(files) > 1:
77 changes: 48 additions & 29 deletions ppfleetx/data/utils/batch_collate_fn.py
@@ -95,35 +95,54 @@ def gpt_collate_fn(batch):
return Tuple([Stack() for raw in zip(*batch)])(batch)


- def ernie_collate_data(data, stack_fn=Stack()):
-     num_fields = len(data[0])
-     out = [None] * num_fields
-     # 0. input_ids,
-     # 1. segment_ids,
-     # 2. input_mask,
-     # 3. masked_lm_positions,
-     # 4. masked_lm_labels,
-     # 5. next_sentence_labels
-     for i in (0, 1, 2, 5):
-         out[i] = stack_fn([x[i] for x in data])
-     out[5] = out[5].reshape([-1, 1])
-     batch_size, seq_length = out[0].shape
-     size = num_mask = sum(len(x[3]) for x in data)
-     # masked_lm_positions
-     # Organize as a 1D tensor for gather or use gather_nd
-     if size % 8 != 0:
-         size += 8 - (size % 8)
-     out[3] = np.full(size, 0, dtype=np.int32)
-     # masked_lm_labels
-     out[4] = np.full([size, 1], -1, dtype=np.int64)
-     mask_token_num = 0
-     for i, x in enumerate(data):
-         for j, pos in enumerate(x[3]):
-             out[3][mask_token_num] = i * seq_length + pos
-             out[4][mask_token_num] = x[4][j]
-             mask_token_num += 1
-
-     return out
+ class ErnieCollateData():
+     def __init__(self, micro_batch_size=1):
+         self.micro_batch_size = micro_batch_size
+
+     def generate_data(self, data, stack_fn=Stack()):
+         num_fields = len(data[0])
+         out = [None] * num_fields
+         # 0. input_ids,
+         # 1. segment_ids,
+         # 2. input_mask,
+         # 3. masked_lm_positions,
+         # 4. masked_lm_labels,
+         # 5. next_sentence_labels
+         for i in (0, 1, 2, 5):
+             out[i] = stack_fn([x[i] for x in data])
+         out[5] = out[5].reshape([-1, 1])
+         batch_size, seq_length = out[0].shape
+         size = num_mask = sum(len(x[3]) for x in data)
+         # masked_lm_positions
+         # Organize as a 1D tensor for gather or use gather_nd
+         if size % 8 != 0:
+             size += 8 - (size % 8)
+         out[3] = np.full(size, 0, dtype=np.int32)
+
+         # masked_lm_labels
+         out[4] = np.full([size, 1], -1, dtype=np.int64)
+         mask_token_num = 0
+         for i, x in enumerate(data):
+             for j, pos in enumerate(x[3]):
+                 out[3][mask_token_num] = i * seq_length + pos
+                 out[4][mask_token_num] = x[4][j]
+                 mask_token_num += 1
+         return out
+
+     def __call__(self, data):
+         accumulate_steps = len(data) // self.micro_batch_size
+         if accumulate_steps == 1:
+             return self.generate_data(data)
+         else:
+             self.micro_batch_size = len(data) // accumulate_steps
+             all_data = [[] for _ in range(6)]
+             for acc_step in range(accumulate_steps):
+                 tmp = self.generate_data(
+                     data[acc_step * self.micro_batch_size:(acc_step + 1) *
+                          self.micro_batch_size])
+                 for i in range(6):
+                     all_data[i].append(tmp[i])
+             return all_data


def imagen_collate_fn(batch):
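
To make the micro-batch splitting in ErnieCollateData.__call__ concrete, here is a hedged usage sketch. The import path comes from this file's location in the diff; the toy samples and their values are invented, but the field order follows the comments in generate_data:

import numpy as np
from ppfleetx.data.utils.batch_collate_fn import ErnieCollateData  # class added in this commit

seq_len = 8

def fake_sample():
    # (input_ids, segment_ids, input_mask, masked_lm_positions, masked_lm_labels, next_sentence_label)
    input_ids = np.arange(seq_len, dtype=np.int64)
    segment_ids = np.zeros(seq_len, dtype=np.int64)
    input_mask = np.ones(seq_len, dtype=np.float32)
    masked_lm_positions = [1, 3]
    masked_lm_labels = [100, 200]
    next_sentence_label = np.array([1], dtype=np.int64)
    return (input_ids, segment_ids, input_mask,
            masked_lm_positions, masked_lm_labels, next_sentence_label)

batch = [fake_sample() for _ in range(4)]

# micro_batch_size=2 over 4 samples -> 2 accumulation steps: each of the 6 fields comes back
# as a list with one array per micro batch, ready for the pipeline-parallel schedule.
out = ErnieCollateData(micro_batch_size=2)(batch)
print(len(out), len(out[0]), out[0][0].shape)   # 6 2 (2, 8)

# micro_batch_size equal to the whole batch -> a single step, plain arrays as before.
out_single = ErnieCollateData(micro_batch_size=4)(batch)
print(out_single[0].shape)                      # (4, 8)
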