# export_tokenizer_to_hf.py (forked from karpathy/llama2.c)
"""
Load the vocab and merges extracted from the result directory and export the
tokenizer (plus the converted model) to the Hugging Face Hub.

Steps to launch:
- Copy ckpt.pt, tokenizer.model (renamed appropriately) and model.bin (for
  tests) into {path}
- python export_tokenizer_to_hf.py
- python test_hf_model.py
"""
import glob
import os
import shutil

import sentencepiece as spm
from tokenizers import SentencePieceBPETokenizer, normalizers
from transformers import AutoTokenizer, LlamaForCausalLM
path = 'data/result2'  # directory containing ckpt.pt and tokenizer.model
output_hf = 'musiclang/control_masking_optimized_trained'  # change this to the Hub repo you want to push to
tokenizer_name = 'tokenizer'  # basename of the SentencePiece .model file
files_to_copy = glob.glob('base_tokenizer/*')  # static tokenizer config files to ship alongside
extract = True  # re-extract vocab.json/merges.json from the .model file
if not os.path.exists(path):
    os.makedirs(path)
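# Fail fast if the inputs listed in the docstring are missing. This guard is
# a small sketch added for convenience (not part of the original launch
# steps); the expected file names are taken from the docstring above.
for required in ('ckpt.pt', f'{tokenizer_name}.model'):
    if not os.path.exists(os.path.join(path, required)):
        print(f'warning: expected {path}/{required} to exist (see docstring)')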
def extract_model(output_hf):
    # Convert the llama2.c checkpoint into a Hugging Face model directory,
    # then load it and push it to the Hub
    program = f'python export.py {path} --version -1 --dtype fp32 --checkpoint {path}/ckpt.pt'
    res = os.system(program)
    print(res)  # shell exit status (0 on success)
    model = LlamaForCausalLM.from_pretrained(path)
    # Push to hub
    model.save_pretrained(output_hf, push_to_hub=True)

# Convert and push the model
extract_model(output_hf)
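# Note: os.system only hands back the shell's exit status. A subprocess-based
# variant (a sketch, not what this script uses) would raise on failure instead
# of silently continuing:
#   import subprocess
#   subprocess.run(['python', 'export.py', path, '--version', '-1',
#                   '--dtype', 'fp32', '--checkpoint', f'{path}/ckpt.pt'],
#                  check=True)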
if extract:
    # Extract vocab.json and merges.json from the SentencePiece model
    program = f'python sentencepiece_extractor.py --provider sentencepiece --vocab-output-path {path}/vocab.json --merges-output-path {path}/merges.json --model {path}/{tokenizer_name}.model'
    os.system(program)
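# The extractor is expected to have produced vocab.json and merges.json; a
# quick existence check (a sketch, not in the original flow) localizes the
# failure instead of letting from_file() crash later.
for produced in (f'{path}/vocab.json', f'{path}/merges.json'):
    if not os.path.exists(produced):
        raise FileNotFoundError(f'{produced} is missing; did sentencepiece_extractor.py run?')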
LOCAL_MODEL = f'{path}/tokenizer.model'
FINAL_MODEL = f'{path}'
vocab_file = f'{path}/vocab.json'
merges_file = f'{path}/merges.json'

# Rebuild the tokenizer from the extracted vocab/merges, and clear the
# normalizer so the raw symbols pass through unchanged
tok = SentencePieceBPETokenizer.from_file(vocab_file, merges_file)
tok.normalizer = normalizers.Sequence([])
tok.save(os.path.join(FINAL_MODEL, 'tokenizer.json'))
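# Optional round-trip check (a sketch, using only the tokenizers API): reload
# the freshly written tokenizer.json to make sure it parses cleanly before
# transformers picks it up.
from tokenizers import Tokenizer
reloaded = Tokenizer.from_file(os.path.join(FINAL_MODEL, 'tokenizer.json'))
print(f'tokenizer.json reloaded, vocab size = {reloaded.get_vocab_size()}')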
# Copy the static tokenizer config files next to tokenizer.json
for file in files_to_copy:
    shutil.copy(file, FINAL_MODEL)
# Load both tokenizers: the HF one from the exported directory and the
# reference SentencePiece one from the original .model file
hf_tokenizer = AutoTokenizer.from_pretrained(FINAL_MODEL)
spm_tokenizer = spm.SentencePieceProcessor(model_file=LOCAL_MODEL)
# Test the tokenization
x = "ʝ1H<NÅɓʌʚDŽõƧɥʌʛDŽõƧɷʌʚDŽõƧò1H<NÅNjʌʘDŽõƄɓʌʖDŽôƄò1H<NÅɓʌʚDŽõƲɧʌʛDŽõƑɸʌʚDŽõƲò1H<NÅNjʌʘDŽõƄɓʌʖDŽõŷʂʌʑDŽõŨò1H<NÅñ1H<NÅñ1H<NÅñ1H<NÅñ1H<NÅɶʌʓǃóƧɓʌʚDŽõŀɥʌʛDŽõƧɮʌʓDŽõƑɻʌʓDŽõƄò1H<NÅɋʌʚDŽõŻɓʌʛDŽôŪɮʌʓDŽõƧɿʌʛDŽõųò1H<NÅɥʌʚDŽõƧɷʌʚDŽõƧò1H<NÅNjʌʚDŽõŀȏʌʘDŽõŻȹʌʖDŽóŀɋʌʘDŽõŦɮʌʖDŽõƧɿʌʑDŽôųò1H<NÅɉʌʖDŽóŻɓʌʚDŽõƧɥʌʛDŽõƧɷʌʚDŽõƧò1H<NÅNjʌʘDŽõƄɓʌʖDŽõƄò1H<NÅNjʌʑDŽõƄɓʌʓǃõŪɷʌʘǃôŮò1H<NÅòʜ"
ids_hf = hf_tokenizer.encode(x, add_special_tokens=False)
ids_spm = spm_tokenizer.encode(x)
decode_hf = hf_tokenizer.decode(ids_hf)
decode_spm = spm_tokenizer.decode(ids_spm)
assert len(ids_hf) == len(ids_spm), "Length mismatch between HF and SPM tokenization"
wrong_idxs = [idx for idx, (c, d) in enumerate(zip(ids_hf, ids_spm)) if c != d]
assert not wrong_idxs, f"Mismatch between HF and SPM tokenization at indices {wrong_idxs}"
assert decode_hf == decode_spm, "Mismatch between HF and SPM decoded strings"
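# If one of the assertions above fires, mapping ids back to pieces on both
# sides usually pinpoints the offending token (a debugging sketch using
# standard transformers / sentencepiece lookups):
#   i = wrong_idxs[0]
#   print(hf_tokenizer.convert_ids_to_tokens([ids_hf[i]]),
#         spm_tokenizer.id_to_piece(ids_spm[i]))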
# Upload the tokenizer to the Hub repo at output_hf
hf_tokenizer.save_pretrained(output_hf, push_to_hub=True)
print('GOOD!')
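# To double-check the push, the tokenizer can be fetched straight back from
# the Hub (a sketch; needs network access and, for private repos, credentials):
#   roundtrip = AutoTokenizer.from_pretrained(output_hf)
#   assert roundtrip.encode(x, add_special_tokens=False) == ids_hf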