Merge pull request #59 from baai-open-internal/fix_issues
fixed some issues
marscrazy committed Aug 26, 2022
2 parents 6ef4190 + 351fba7 commit 35b5d9a
Showing 11 changed files with 39 additions and 36 deletions.
8 changes: 4 additions & 4 deletions flagai/data/dataset/seq2seq/dataset.py
@@ -403,13 +403,13 @@ def __init__(self,
self.processor = SummmaryProcessor(self.task_name, self.data_dir,
tokenizer)
elif self.task_name in ["xsum"]:
self.processor = XSumProcessor(self.data_dir, tokenizer)
self.processor = XSumProcessor(self.data_dir+task_name, tokenizer)
elif self.task_name in ["squad_generation"]:
self.processor = SQuADProcessor(self.data_dir, tokenizer)
self.processor = SQuADProcessor(self.data_dir+task_name, tokenizer)
elif self.task_name in ['cmrc']:
self.processor = CMRCProcessor(self.data_dir, tokenizer)
self.processor = CMRCProcessor(self.data_dir+task_name, tokenizer)
elif self.task_name in ['wsc']:
self.processor = WSCProcessor(self.data_dir, tokenizer)
self.processor = WSCProcessor(self.data_dir+task_name, tokenizer)
else:
raise NotImplementedError(self.task_name)
example_list = self.processor.create_examples(dataset_type)
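The seq2seq processors now receive `self.data_dir+task_name`, so callers pass the parent data directory and the task subfolder is appended internally (see the matching `data_dir` change in tests/test_glm_seq2seq.py below). A minimal usage sketch under that convention; the import paths and the checkpoint name are assumptions, not part of this commit:

from flagai.data.tokenizer import Tokenizer
from flagai.data.dataset import Seq2SeqDataset

tokenizer = Tokenizer.from_pretrained("GLM-large-ch")   # assumed checkpoint name
train_dataset = Seq2SeqDataset(task_name="cmrc",
                               data_dir="./data/",      # parent dir; "cmrc" is appended internally
                               dataset_type="train",
                               tokenizer=tokenizer)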
14 changes: 8 additions & 6 deletions flagai/data/tokenizer/uni_tokenizer/bpe_tokenizer.py
@@ -51,29 +51,31 @@ def __init__(self,
super().__init__(**kwargs)
self.max_len = max_len if max_len is not None else int(1e12)


self.errors = errors # how to handle errors in decoding
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
with open(merges_file, encoding='utf-8') as file:
bpe_data = file.read().split('\n')[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_data]
# file.close()
if not vocab_file:
vocab = list(bytes_to_unicode().values())
vocab = vocab + [v for v in vocab]
for merge in bpe_merges:
vocab.append(''.join(merge))
self.encoder = dict(zip(vocab, range(len(vocab))))
else:
self.encoder = json.load(open(vocab_file))
with open(vocab_file) as file:
self.encoder = json.load(file)
self.decoder = {v: k for k, v in self.encoder.items()}

self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
# self.cache = {t:t for t in special_tokens}

# Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
# special = "|".join(special_tokens)
self.pat = re.compile(r"""|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
self.pat = re.compile(
r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
)

self.special_tokens = {}
self.special_tokens_decoder = {}
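The pre-tokenization pattern is replaced with the standard GPT-2 regex; the old pattern began with an empty alternative (a leading `|`), which matches at every position, and consumed digits one at a time. A quick sketch of what the new pattern produces, assuming the module imports the third-party `regex` package as `re`, which is what makes the `\p{L}`/`\p{N}` classes work:

import regex as re   # assumption: the `regex` package, not the stdlib `re`

pat = re.compile(
    r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
)

# Words and numbers keep their leading space; contractions split off cleanly.
print(pat.findall("He'll say 2 words!"))
# ['He', "'ll", ' say', ' 2', ' words', '!']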
2 changes: 1 addition & 1 deletion flagai/data/tokenizer/uni_tokenizer/sp_tokenizer.py
@@ -32,7 +32,7 @@ def vocab_size(self):
return self.sp_model.get_piece_size()

def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab = {self.convert_id_to_token(i): i for i in range(self.vocab_size)}
# vocab.update(self.added_tokens_encoder)
return vocab

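The comprehension iterates over single ids, so it must call the per-id helper `convert_id_to_token` rather than the list-based `convert_ids_to_tokens`. A minimal sketch of the intended mapping, assuming `convert_id_to_token` wraps SentencePiece's `id_to_piece`:

import sentencepiece as spm

sp_model = spm.SentencePieceProcessor(model_file="spiece.model")   # placeholder model path
vocab = {sp_model.id_to_piece(i): i for i in range(sp_model.get_piece_size())}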
25 changes: 12 additions & 13 deletions flagai/data/tokenizer/uni_tokenizer/tokenizer.py
@@ -286,6 +286,9 @@ def __init__(self,
self.command_id_map = {tok.Id: tok for tok in self._command_tokens}
self._command_token_tokens = list(self.command_token_map.keys())

def get_vocab(self):
return self.text_tokenizer.get_vocab()

def get_command_id(self, name):
"""get command token corresponding to `name`"""
return self.command_name_map[name].Id
@@ -318,6 +321,12 @@ def _encode(self, text):
ids = self.text_tokenizer.convert_tokens_to_ids(tokens)
return ids

def convert_tokens_to_ids(self, tokens):
return self.text_tokenizer.convert_tokens_to_ids(tokens)

def convert_ids_to_tokens(self, ids):
return self.text_tokenizer.convert_ids_to_tokens(ids)

def EncodeAsTokens(self, text, process_fn=None):
"""convert wordpiece token to Id"""
processed_text = text
@@ -585,18 +594,8 @@ def tokenize_as_tensor(self, texts):
sot_token = self.get_command_id('sot')
eot_token = self.get_command_id('eot')
return self.text_tokenizer.tokenize(texts, sot_token=sot_token, eot_token=eot_token)
# if isinstance(texts, str):
# texts = [texts]

# sot_token = self.get_command_id('sot')
# eot_token = self.get_command_id('eot')
# all_tokens = [[sot_token] + self.encode(text) + [eot_token] for text in texts]
# result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)

# for i, tokens in enumerate(all_tokens):
# if len(tokens) > context_length:
# tokens = tokens[:context_length] # Truncate
# result[i, :len(tokens)] = torch.tensor(tokens)
# return result

def tokenize(self, texts):
return self.text_tokenizer.tokenize(texts)
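With `get_vocab`, `convert_tokens_to_ids`, and `convert_ids_to_tokens` now exposed on the unified `Tokenizer`, callers no longer need to reach into `text_tokenizer` (the predictor changes below rely on this). A hedged round-trip sketch; the import path is assumed from the repo layout and the checkpoint name is taken from tests/test_bert.py:

from flagai.data.tokenizer import Tokenizer

tokenizer = Tokenizer.from_pretrained("RoBERTa-base-ch")

tokens = tokenizer.tokenize("今天天气不错")            # no tokenizer.text_tokenizer needed
ids = tokenizer.convert_tokens_to_ids(tokens)          # delegates to the underlying text_tokenizer
print(tokenizer.convert_ids_to_tokens(ids))            # maps the ids back to tokens

vocab = tokenizer.get_vocab()                          # token -> id dict, used by bert_beamsearch below
print(len(vocab))                                      # vocabulary size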


3 changes: 3 additions & 0 deletions flagai/data/tokenizer/uni_tokenizer/wp_tokenizer.py
@@ -56,6 +56,9 @@ def __init__(self, vocab_file=None, do_basic_tokenize=True,
def vocab_size(self):
return len(self.vocab)

def get_vocab(self):
return self.vocab

def word_piece(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
5 changes: 2 additions & 3 deletions flagai/model/bert_model.py
@@ -151,8 +151,8 @@ def forward(self,
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
if attention_mask is not None:
extended_attention_mask = extended_attention_mask * attention_mask
extended_attention_mask = extended_attention_mask.unsqueeze(
1).unsqueeze(2)
# extended_attention_mask = extended_attention_mask.unsqueeze(
# 1).unsqueeze(2)
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
@@ -356,7 +356,6 @@ def forward(self, **data):
input_shape = input_ids.shape
seq_len = input_shape[1]
a_mask = self.make_unilm_mask(token_type_ids, seq_len)

encoder_out, pooler_out = self.model(
input_ids,
token_type_ids,
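The surrounding comment in bert_model.py describes the usual additive-mask trick: a {0,1} attention mask becomes 0.0 for positions to attend and -10000.0 for masked positions, and is then added to the attention scores. A generic sketch of that arithmetic (not this model's exact code), assuming a [batch, seq] mask broadcast against the score tensor:

import torch

attention_mask = torch.tensor([[1, 1, 1, 0]])           # 1 = attend, 0 = masked, shape [batch, seq]
extended = attention_mask[:, None, None, :].float()     # shape [batch, 1, 1, seq], broadcastable to scores
extended = (1.0 - extended) * -10000.0                  # 0.0 where we attend, -10000.0 where masked
# later: attention_scores = attention_scores + extended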
10 changes: 5 additions & 5 deletions flagai/model/predictor/predictor.py
@@ -168,12 +168,12 @@ def predict_ner(self,
model.eval()
device = next(model.parameters()).device
tokenizer = self.tokenizer
tokens = tokenizer.text_tokenizer.tokenize(text)
#maxlen=maxlen,
#add_spatial_tokens=True)

tokens = tokenizer.tokenize(text)
# maxlen=maxlen,
# add_spatial_tokens=True)
mapping = tokenizer.rematch(text, tokens)
token_ids = tokenizer.text_tokenizer.convert_tokens_to_ids(tokens)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
token_ids = torch.tensor([token_ids], dtype=torch.long, device=device)

trans = model.state_dict().get("crf_layer.trans", None)
2 changes: 1 addition & 1 deletion flagai/model/predictor/utils.py
@@ -670,7 +670,7 @@ def glm_beamsearch(model, tokenizer, text, out_max_length, beam_size): #
def bert_beamsearch(model, tokenizer, text, input_max_length, out_max_length,
beam_size):
tokenizer_out = tokenizer.encode_plus(text, max_length=input_max_length)
vocab = tokenizer.text_tokenizer.vocab
vocab = tokenizer.get_vocab()
token_ids = tokenizer_out["input_ids"]
token_ids = np.array(token_ids).reshape(1, -1)
out_puts_ids = bert_beam_search(model,
1 change: 1 addition & 0 deletions tests/test_bert.py
@@ -24,6 +24,7 @@ def setUp(self) -> None:
BertForSequenceLabelingCRF]
self.model_name = "RoBERTa-base-ch"
self.bert_path = "./checkpoints/RoBERTa-base-ch/config.json"
# self.tokenizer = BertTokenizer("./checkpoints/RoBERTa-base-ch/vocab.txt")
self.tokenizer = Tokenizer.from_pretrained(self.model_name)
print("loading bert model successfully!")

4 changes: 2 additions & 2 deletions tests/test_glm_seq2seq.py
@@ -36,11 +36,11 @@ def test_init_trainer_pytorch(self):
tokenizer = Tokenizer.from_pretrained(model_name)

train_dataset = Seq2SeqDataset(task_name=task_name,
data_dir='./data/cmrc/',
data_dir='./data/',
dataset_type='train',
tokenizer=tokenizer)
valid_dataset = Seq2SeqDataset(task_name=task_name,
data_dir='./data/cmrc/',
data_dir='./data/',
dataset_type='dev',
tokenizer=tokenizer)

1 change: 0 additions & 1 deletion tests/test_tokenizer.py
@@ -69,7 +69,6 @@ def test_tokenizer_cpm1(self):
'fried chicken makes me happy', 'DecodeIds Error')

def test_tokenizer_opt(self):
# tokenizer = OPTTokenizer(tokenizer_model_type="facebook/opt-125m")
tokenizer = Tokenizer.from_pretrained('opt-125m-en')
self.assertEqual(tokenizer.encode("day"), [1208], '')
self.assertEqual(tokenizer.encode_plus("fried chicken makes me happy")["input_ids"],
