Commit
Merge pull request #20 from baai-open-internal/transform_tokenizer
Transform tokenizer
marscrazy authored Aug 25, 2022
2 parents 6076287 + 25b659b commit 8cffa38
Showing 61 changed files with 2,085 additions and 605 deletions.
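The change is mechanical across the 61 files: the per-model tokenizer classes (GLMLargeChTokenizer, GLMLargeEnWordPieceTokenizer, GLM10bENBPETokenizer) are replaced by a single Tokenizer resolved from a model name, and command-token lookups move from get_command(...).Id to get_command_id(...). Below is a minimal before/after sketch of the pattern, assembled only from the hunks in this commit; treat it as an illustration, not complete FlagAI API documentation.

```python
# Migration pattern applied throughout this PR; all names below are taken
# from the diff hunks in this commit. Illustrative sketch only.
from flagai.data.tokenizer import Tokenizer

# Before: one tokenizer class per checkpoint
#   from flagai.data.tokenizer import GLMLargeChTokenizer
#   tokenizer = GLMLargeChTokenizer()
#   pad_id = tokenizer.get_command('pad').Id

# After: a unified Tokenizer keyed by model name
model_name = 'GLM-large-ch'
tokenizer = Tokenizer.from_pretrained(model_name)
pad_id = tokenizer.get_command_id('pad')  # command tokens looked up by name
eos_id = tokenizer.get_command_id('eos')  # e.g. the eod_token in the pretrain example
```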
2 changes: 1 addition & 1 deletion doc_zh/TUTORIAL_12_GLM_EXAMPLE_TITLE_GENERATION.md
@@ -118,7 +118,7 @@ class GLMTitleGenerationCollateFN():
 ```python
 train_src, train_tgt = read_file()
 print('-----------train data length:', len(train_src))
-my_collate_fn = GLMTitleGenerationCollateFN(pad_id=tokenizer.get_command('pad').Id)
+my_collate_fn = GLMTitleGenerationCollateFN(pad_id=tokenizer.get_command_id('pad'))
 train_dataset = GLMTitleGenerationDataset(train_src,
                                           train_tgt)
 ```
2 changes: 1 addition & 1 deletion doc_zh/TUTORIAL_13_GLM_EXAMPLE_PEOTRY_GENERATION.md
@@ -131,7 +131,7 @@ class GLMPoetryDynamicCollateFN():
 ```python
 train_src, train_tgt = read_file()
 print('-----------train data length:', len(train_src))
-my_collate_fn = GLMPoetryDynamicCollateFN(pad_id=tokenizer.get_command('pad').Id)
+my_collate_fn = GLMPoetryDynamicCollateFN(pad_id=tokenizer.get_command_id('pad'))
 train_dataset = GLMPoetryDataset(train_src,
                                  train_tgt)
 ```
2 changes: 1 addition & 1 deletion docs/TUTORIAL_12_GLM_EXAMPLE_TITLE_GENERATION.md
@@ -119,7 +119,7 @@ class GLMTitleGenerationCollateFN():
 ```python
 train_src, train_tgt = read_file()
 print('-----------train data length:', len(train_src))
-my_collate_fn = GLMTitleGenerationCollateFN(pad_id=tokenizer.get_command('pad').Id)
+my_collate_fn = GLMTitleGenerationCollateFN(pad_id=tokenizer.get_command_id('pad'))
 train_dataset = GLMTitleGenerationDataset(train_src,
                                           train_tgt)
 ```
2 changes: 1 addition & 1 deletion docs/TUTORIAL_13_GLM_EXAMPLE_PEOTRY_GENERATION.md
@@ -122,7 +122,7 @@ class GLMPoetryDynamicCollateFN():
 ```python
 train_src, train_tgt = read_file()
 print('-----------train data length:', len(train_src))
-my_collate_fn = GLMPoetryDynamicCollateFN(pad_id=tokenizer.get_command('pad').Id)
+my_collate_fn = GLMPoetryDynamicCollateFN(pad_id=tokenizer.get_command_id('pad'))
 train_dataset = GLMPoetryDataset(train_src,
                                  train_tgt)
 ```
2 changes: 1 addition & 1 deletion examples/bert_title_generation_english/generate.py
@@ -14,7 +14,7 @@
 maxlen = 512
 auto_loader = AutoLoader(
     "seq2seq",
-    model_name="bert-base-uncased",
+    model_name="BERT-base-en",
     model_dir=model_dir,
 )
 model = auto_loader.get_model()
4 changes: 2 additions & 2 deletions examples/clip/inference_clip.py
@@ -17,7 +17,7 @@
 def inference():
     image = Image.open("./CLIP.png")
     image = transform(image).unsqueeze(0).to(device)
-    text = tokenizer.tokenize(["a diagram", "a dog", "a cat"]).to(device)
+    text = tokenizer.tokenize_as_tensor(["a diagram", "a dog", "a cat"]).to(device)

     with torch.no_grad():
         image_features = model.encode_image(image)
@@ -27,4 +27,4 @@ def inference():
     print(text_probs.cpu().numpy()[0].tolist())

 if __name__=="__main__":
-    inference()
+    inference()
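The CLIP example gets a companion rename: tokenize becomes tokenize_as_tensor. Judging only by the hunk above, it takes a list of strings and returns a token tensor ready to move to a device; here is a usage sketch under that assumption, with encode_text as the standard CLIP counterpart of the encode_image call shown in the hunk.

```python
# Sketch based on the updated inference_clip.py; assumes `tokenizer`, `model`,
# and `device` are already set up by the surrounding script.
import torch

text = tokenizer.tokenize_as_tensor(["a diagram", "a dog", "a cat"]).to(device)

with torch.no_grad():
    text_features = model.encode_text(text)  # standard CLIP text encoder call
```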
2 changes: 1 addition & 1 deletion examples/clip/train_clip_deepspeed.py
@@ -26,7 +26,7 @@
                   num_checkpoints=1,
                   hostfile="./deepspeed/hostfile",
                   training_script=__file__,
-                  deepspeed_config="./deepspeed/deepspeed.json"
+                  deepspeed_config="./deepspeed.json"
                   )
 loader = AutoLoader(task_name="txt_img_matching",#contrastive learning
                     model_name="clip-base-p32-224",
7 changes: 4 additions & 3 deletions examples/glm_blank_filling/glm_generate_samples.py
@@ -5,16 +5,17 @@
 import torch

 from flagai.model.glm_model import GLMModel
-from flagai.data.tokenizer import GLMLargeChTokenizer
+from flagai.data.tokenizer import Tokenizer
 from flagai.model.predictor.predictor import Predictor
 if __name__ == "__main__":
     """Main training program."""
     print('Generate Samples')
     # Random seeds for reproducability.
     # Model,
-    model = GLMModel.from_pretrain(model_name='GLM-large-ch',
+    model_name = 'GLM-large-ch'
+    model = GLMModel.from_pretrain(model_name=model_name,
                                    download_path="./state_dict/")
-    tokenizer = GLMLargeChTokenizer()
+    tokenizer = Tokenizer.from_pretrained(model_name)

     model.cuda(torch.cuda.current_device())
2 changes: 1 addition & 1 deletion examples/glm_poetry_generation/train.py
@@ -130,7 +130,7 @@ def __call__(self, batch):
 train_src, train_tgt = read_file()
 print('-----------train data length:', len(train_src))
 my_collate_fn = GLMPoetryDynamicCollateFN(
-    pad_id=tokenizer.get_command('pad').Id)
+    pad_id=tokenizer.get_command_id('pad'))
 train_dataset = BertSeq2seqDataset(train_src, train_tgt)

 trainer.train(model, train_dataset=train_dataset, collate_fn=my_collate_fn)
13 changes: 5 additions & 8 deletions examples/glm_pretrain/train.py
@@ -2,7 +2,7 @@
 #
 # Licensed under the Apache License, Version 2.0 (the "License")

-from flagai.data.tokenizer import GLMLargeChTokenizer
+from flagai.data.tokenizer import Tokenizer
 from flagai.model.glm_model import GLMForSeq2Seq
 from flagai.trainer import Trainer
 from flagai.data.dataset import ConstructBlockStrategy
@@ -24,14 +24,11 @@
     load_dir=None,
     lr=1e-4,
     save_interval=10)
-
-model = GLMForSeq2Seq.from_pretrain(model_name='GLM-large-ch')
-
+model_name = 'GLM-large-ch'
+tokenizer = Tokenizer.from_pretrained(model_name)
 ds_args = PretrainDatasetArguments()
-
-tokenizer = GLMLargeChTokenizer()
-
 ds_args = add_args(ds_args, tokenizer)
+model = GLMForSeq2Seq.from_pretrain(model_name=model_name)

 def create_dataset(tokenizer, should_split):
     dataset = get_dataset_lazy("./examples/glm_pretrain/data",
@@ -59,7 +56,7 @@ def create_dataset(tokenizer, should_split):
 collate_fn = None
 if ds_args.block_lm:
     collate_fn = ConstructBlockStrategy(
-        tokenizer, 512, eod_token=tokenizer.get_command('eos').Id)
+        tokenizer, 512, eod_token=tokenizer.get_command_id('eos'))
 metric_methods = DEFAULT_METRICS['pretrain']
 trainer.train(model,
               collate_fn=collate_fn,
6 changes: 3 additions & 3 deletions examples/glm_seq2seq/train.py
@@ -3,7 +3,7 @@
 # Licensed under the Apache License, Version 2.0 (the "License")
 from flagai.trainer import Trainer
 from flagai.model.glm_model import GLMForSeq2Seq
-from flagai.data.tokenizer import GLMLargeEnWordPieceTokenizer, GLMLargeChTokenizer
+from flagai.data.tokenizer import Tokenizer
 from flagai.data.dataset import Seq2SeqDataset
 from flagai.test_utils import Seq2SeqCollateArguments
 from flagai.data.dataset.superglue.control import DEFAULT_METRICS, CH_TASKS
@@ -27,12 +27,12 @@
     print("downloading...")

 if task_name in CH_TASKS:
-    tokenizer = GLMLargeChTokenizer()
     model_name = 'GLM-large-ch'
 else:
-    tokenizer = GLMLargeEnWordPieceTokenizer()
     model_name = 'GLM-large-en'

+tokenizer = Tokenizer.from_pretrained(model_name)
+
 train_dataset = Seq2SeqDataset(task_name=task_name,
                                data_dir='./datasets/',
                                dataset_type='train',
5 changes: 3 additions & 2 deletions examples/glm_superglue/train_10b_clue.py
@@ -4,7 +4,7 @@
 import os
 from flagai.trainer import Trainer
 from flagai.model.glm_model import GLMForSingleTokenCloze
-from flagai.data.tokenizer import GLMLargeChTokenizer
+from flagai.data.tokenizer import Tokenizer
 from flagai.metrics import accuracy_metric
 from flagai.data.dataset import SuperGlueDataset
 from flagai.test_utils import CollateArguments
@@ -21,11 +21,12 @@
     save_dir="./glm_superglue_en",
     save_interval=1)

+model_name = "GLM-large-ch"
 model = GLMForSingleTokenCloze.from_pretrain(download_path="/mnt/test_10b_models",
                                              model_name="GLM-large-ch")


-tokenizer = GLMLargeChTokenizer()
+tokenizer = Tokenizer.from_pretrained("GLM-large-ch")
 train_dataset = SuperGlueDataset(task_name=task_name,
                                  data_dir='./datasets/',
                                  dataset_type='train',
8 changes: 4 additions & 4 deletions examples/glm_superglue/train_10b_superglue.py
@@ -3,7 +3,7 @@
 # Licensed under the Apache License, Version 2.0 (the "License")
 from flagai.trainer import Trainer
 from flagai.model.glm_model import GLMForSingleTokenCloze
-from flagai.data.tokenizer import GLM10bENBPETokenizer, GLMLargeEnWordPieceTokenizer
+from flagai.data.tokenizer import Tokenizer
 from flagai.metrics import accuracy_metric
 from flagai.data.dataset import SuperGlueDataset
 from flagai.test_utils import CollateArguments
@@ -28,11 +28,11 @@
 #                   deepspeed_config='./deepspeed.json',
 #                   training_script=__file__)

+model_name = "GLM-large-en"
 model = GLMForSingleTokenCloze.from_pretrain(download_path="/mnt/test_10b_models",
-                                             model_name="GLM-large-en")
+                                             model_name=model_name)

-tokenizer = GLMLargeEnWordPieceTokenizer()
-
+tokenizer = Tokenizer.from_pretrained(model_name)
 train_dataset = SuperGlueDataset(task_name=task_name,
                                  data_dir='./datasets/',
                                  dataset_type='train',
12 changes: 4 additions & 8 deletions examples/glm_superglue/train_prefix.py
@@ -2,13 +2,12 @@
 #
 # Licensed under the Apache License, Version 2.0 (the "License")
 from flagai.trainer import Trainer
-from flagai.model.glm_model import GLMForSingleTokenCloze, GLMForMultiTokenCloze, GLMForMultiTokenClozeFast, GLMForSequenceClassification
-from flagai.data.tokenizer import GLMLargeEnWordPieceTokenizer, GLMLargeChTokenizer
+from flagai.model.glm_model import GLMForSequenceClassification
+from flagai.data.tokenizer import Tokenizer

 from flagai.data.dataset import SuperGlueDataset
 from flagai.test_utils import CollateArguments
 from flagai.data.dataset.superglue.control import DEFAULT_METRICS, MULTI_TOKEN_TASKS, CH_TASKS
-import unittest
 from flagai.data.dataset import ConstructSuperglueStrategy


@@ -32,13 +31,10 @@

 if task_name in CH_TASKS:
     model_name = 'GLM-large-ch'
-    tokenizer = GLMLargeChTokenizer(add_block_symbols=True,
-                                    add_task_mask=False,
-                                    add_decoder_mask=False,
-                                    fix_command_token=True)
+    add_block_symbols=True,
 else:
     model_name = 'GLM-large-en'
-    tokenizer = GLMLargeEnWordPieceTokenizer()
+tokenizer = Tokenizer.from_pretrained(model_name)

 model = GLMForSequenceClassification.from_pretrain(model_name=model_name, spell_length=2,
                                                    class_num=3, tune_prefix_layers=1)
26 changes: 17 additions & 9 deletions examples/glm_superglue/train_qqp_deepspeed.py
@@ -2,38 +2,46 @@
 #
 # Licensed under the Apache License, Version 2.0 (the "License")
 from flagai.trainer import Trainer
-from flagai.model.glm_model import GLMForSingleTokenCloze
-from flagai.data.tokenizer import GLM10bENBPETokenizer, GLMLargeEnWordPieceTokenizer
+from flagai.model.glm_model import GLMForSingleTokenCloze, GLMForMultiTokenCloze
+from flagai.data.tokenizer import Tokenizer
 from flagai.metrics import accuracy_metric
 from flagai.data.dataset import SuperGlueDataset
 from flagai.test_utils import CollateArguments
 from flagai.data.dataset.superglue.control import DEFAULT_METRICS, MULTI_TOKEN_TASKS, CH_TASKS

-task_name = 'qqp'
+task_name = 'boolq'
 trainer = Trainer(env_type='deepspeed',
-                  epochs=10,
+                  epochs=1000,
                   batch_size=512,
                   eval_interval=100,
                   log_interval=10,
-                  save_interval = 1e5,
+                  save_interval=1e5,
+                  gradient_accumulation_steps=5,
                   checkpoint_activations=True,
                   fp16=True,
                   warm_up=0.1,
                   weight_decay=0.1,
                   save_dir="./qqp",
                   master_ip='127.0.0.1',
-                  master_port=17887,
+                  master_port=17810,
                   num_nodes=1,
                   num_gpus=2,
                   hostfile='./hostfile',
                   deepspeed_config='./deepspeed.json',
                   training_script=__file__)

-model = GLMForSingleTokenCloze.from_pretrain(download_path="/mnt/test_10b_models",
-                                             model_name="GLM-large-en")
+model_name = "GLM-large-en"
+tokenizer = Tokenizer.from_pretrained(model_name)

+if task_name in MULTI_TOKEN_TASKS:
+    model = GLMForMultiTokenCloze.from_pretrain(
+        download_path="/mnt/test_10b_models", model_name=model_name)
+else:
+    model = GLMForSingleTokenCloze.from_pretrain(download_path="/mnt/test_10b_models",
+                                                 model_name=model_name)

-tokenizer = GLMLargeEnWordPieceTokenizer()
+# model = GLMForSingleTokenCloze.from_pretrain(download_path="/mnt/test_10b_models",
+#                                              model_name="GLM-large-en")
 train_dataset = SuperGlueDataset(task_name=task_name,
                                  data_dir='./datasets/',
                                  dataset_type='train',
10 changes: 5 additions & 5 deletions examples/glm_superglue/train_qqp_pytorch.py
@@ -4,8 +4,7 @@

 from flagai.trainer import Trainer
 from flagai.model.glm_model import GLMForSingleTokenCloze
-from flagai.model.bert_model import BertForClsClassifier
-from flagai.data.tokenizer import GLM10bENBPETokenizer, GLMLargeEnWordPieceTokenizer
+from flagai.data.tokenizer import Tokenizer
 from flagai.metrics import accuracy_metric
 from flagai.data.dataset import SuperGlueDataset
 from flagai.test_utils import CollateArguments
@@ -28,11 +27,12 @@
                   warm_up=0.1,
                   save_dir="./glm_large_qqp_pytorch")

+model_name = "GLM-large-en"
+tokenizer = Tokenizer.from_pretrained(model_name)
 model = GLMForSingleTokenCloze.from_pretrain(download_path="/mnt/test_10b_models",
-                                             model_name="GLM-large-en")
+                                             model_name=model_name)


-#tokenizer = GLM10bENBPETokenizer()
-tokenizer = GLMLargeEnWordPieceTokenizer()

 train_dataset = SuperGlueDataset(task_name=task_name,
                                  data_dir='./datasets/',
7 changes: 4 additions & 3 deletions examples/glm_superglue/train_qqp_pytorch_fp16.py
@@ -3,7 +3,7 @@
 # Licensed under the Apache License, Version 2.0 (the "License")
 from flagai.trainer import Trainer
 from flagai.model.glm_model import GLMForSingleTokenCloze
-from flagai.data.tokenizer import GLM10bENBPETokenizer, GLMLargeEnWordPieceTokenizer
+from flagai.data.tokenizer import Tokenizer
 from flagai.metrics import accuracy_metric
 from flagai.data.dataset import SuperGlueDataset
 from flagai.test_utils import CollateArguments
@@ -24,9 +24,10 @@
                   warm_up=0.1,
                   save_dir="./glm_large_qqp_pytorch_fp16")

+model_name = "GLM-large-en"
 model = GLMForSingleTokenCloze.from_pretrain(download_path="/mnt/test_10b_models",
-                                             model_name="GLM-large-en")
-tokenizer = GLMLargeEnWordPieceTokenizer()
+                                             model_name=model_name)
+tokenizer = Tokenizer.from_pretrained(model_name)
 train_dataset = SuperGlueDataset(task_name=task_name,
                                  data_dir='./datasets/',
                                  dataset_type='train',
8 changes: 4 additions & 4 deletions examples/glm_superglue/train_qqp_pytorchddp.py
@@ -3,7 +3,7 @@
 # Licensed under the Apache License, Version 2.0 (the "License")
 from flagai.trainer import Trainer
 from flagai.model.glm_model import GLMForSingleTokenCloze
-from flagai.data.tokenizer import GLM10bENBPETokenizer, GLMLargeEnWordPieceTokenizer
+from flagai.data.tokenizer import Tokenizer
 from flagai.metrics import accuracy_metric
 from flagai.data.dataset import SuperGlueDataset
 from flagai.test_utils import CollateArguments
@@ -29,11 +29,11 @@
                   hostfile='./hostfile',
                   training_script=__file__)

+model_name = "GLM-large-en"
 model = GLMForSingleTokenCloze.from_pretrain(download_path="/mnt/test_10b_models",
-                                             model_name="GLM-large-en")
+                                             model_name=model_name)

-#tokenizer = GLM10bENBPETokenizer()
-tokenizer = GLMLargeEnWordPieceTokenizer()
+tokenizer = Tokenizer.from_pretrained(model_name)
 train_dataset = SuperGlueDataset(task_name=task_name,
                                  data_dir='./datasets/',
                                  dataset_type='train',
2 changes: 1 addition & 1 deletion examples/glm_title_generation/train.py
@@ -134,7 +134,7 @@ def __call__(self, batch):

 sents_src, sents_tgt = read_file()
 my_collate_fn = GLMPoetryDynamicCollateFN(
-    pad_id=tokenizer.get_command('pad').Id)
+    pad_id=tokenizer.get_command_id('pad'))

 data_len = len(sents_tgt)
 train_size = int(data_len * 0.8)
1 change: 0 additions & 1 deletion examples/opt/generate_opt_1.3b.py
@@ -1,4 +1,3 @@
-
 from flagai.model.predictor.predictor import Predictor
 from flagai.auto_model.auto_loader import AutoLoader

1 change: 0 additions & 1 deletion examples/roberta_faq/1_construct_data.py
@@ -10,7 +10,6 @@
 import numpy as np
 from tqdm import tqdm
 import collections
-import faiss

 faq_data_path = "./data/financezhidao_filter.csv"
 answer_save_path = "./data/finance_fqa.json"
(Diff truncated: the remaining changed files of the 61 are not shown in this view.)