import argparse
import contextlib
import glob
import io
import itertools
import os
from functools import partial
from multiprocessing import Pool, current_process

import numpy as np
import spacy
import tqdm
from pytorch_pretrained_bert import GPT2Tokenizer


def every(it, n):
    """every(ABCDEFG, 2) --> AB CD EF G

    Yield lists of up to n consecutive items from `it`; the final batch
    may be shorter.
    """
    it = iter(it)
    toexit = False
    while not toexit:
        batch = []
        for _ in range(n):
            try:
                batch.append(next(it))
            except StopIteration:
                toexit = True
                break
        if not batch:
            break
        yield batch


def tokenizeGpt2Spawn(args, nproc=None, **kwargs):
    """Tokenize all files matching args.input_glob with GPT-2 BPE, using a
    pool of worker processes (or a single process when nproc == 1)."""
    # Make sure the output directory exists.
    os.makedirs(args.output_dir, exist_ok=True)
    extraction_file_paths = glob.iglob(args.input_glob)
    if nproc == 1:
        # Serial path: process files in small batches in this process.
        for batch in every(extraction_file_paths, 6):
            print(tokenizeGpt2(batch, args, **kwargs))
        return
    with Pool() as pool:
        # Each worker tokenizes one batch of files and reports
        # (omitted_files, total_files) for that batch.
        out = pool.imap_unordered(
            partial(tokenizeGpt2, args=args, **kwargs),
            every(extraction_file_paths, args.file_bs))
        omitted, total = zip(*out)
        print(f'\n\nSkipped {sum(omitted)}/{sum(total)} files')


def tokenizeGpt2(extraction_file_paths, args, min_length=20):
    """Tokenize text using GPT-2's pretrained BPE encoder.

    Saves compressed .npz files that can be loaded with
    `with np.load('filename.npz') as a: a['arr_0']`.
    Files shorter than min_length tokens are skipped, since they are likely
    to be low quality.
    """
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    EOT = tokenizer.encoder['<|endoftext|>']
    omitted_files = 0
    combined = []
    # Give each worker process its own tqdm progress row.
    p = current_process()
    index = p._identity[0] if p._identity else 0
    bar = tqdm.tqdm(extraction_file_paths, position=index, desc=f'proc {index}')
    for extraction_file_path in bar:
        _, filename = os.path.split(extraction_file_path)
        text_file = os.path.join(
            args.output_dir, filename.replace('.txt', '.tokenized.npz'))
        with io.open(extraction_file_path, 'r', encoding='utf-8') as fi:
            # Suppress the tokenizer's warnings about sequence length.
            with open(os.devnull, 'w') as f, contextlib.redirect_stderr(f):
                # Appending the end-of-text token makes it safe to
                # concatenate documents.
                out = tokenizer.encode(fi.read()) + [EOT]
        if len(out) < min_length:
            omitted_files += 1
            continue
        combined += out
        # Flush to disk once roughly args.combine tokens have accumulated.
        if len(combined) > args.combine:
            np.savez_compressed(text_file, combined)
            combined = []
    # Save whatever is left.
    if combined:
        np.savez_compressed(text_file, combined)
    return omitted_files, bar.total
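

# A minimal sketch (not part of the original script) of reading back one of
# the .npz shards written by tokenizeGpt2 above; the file name
# 'wiki_00.tokenized.npz' is a hypothetical example.
#
#     with np.load('wiki_00.tokenized.npz') as a:
#         tokens = a['arr_0']  # 1-D array of GPT-2 BPE token ids, EOT-separated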


def tokenizeSpacy(args):
    """Tokenize each line of every input file with spaCy and write one
    .tokenized.txt file per input file, dropping empty lines."""
    nlp = spacy.load('en')
    # Make sure the output directory exists.
    os.makedirs(args.output_dir, exist_ok=True)
    extraction_file_paths = glob.glob(args.input_glob)
    for extraction_file_path in extraction_file_paths:
        path, filename = os.path.split(extraction_file_path)
        text_file = os.path.join(
            args.output_dir, filename.replace('.txt', '.tokenized.txt'))
        fi = io.open(extraction_file_path, 'r', encoding='utf-8')
        fo = io.open(text_file, 'w', encoding='utf-8')
        omitted_line_count = 0
        for line in fi:
            if len(line) > 1:
                doc = nlp(line)
                fo.write(' '.join([x.text for x in doc]))
            else:
                omitted_line_count += 1
        fi.close()
        fo.close()
        print(f'Omitted {omitted_line_count} empty lines')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_glob', type=str, default='*.txt')
    parser.add_argument('--output_dir', type=str, default='tokenized')
    parser.add_argument('--tokenizer', type=str,
                        default='spacy', choices=['spacy', 'gpt2'])
    parser.add_argument('--combine', type=int, default=int(1e8),
                        help="number of tokens to accumulate before writing "
                             "an output file (gpt2 mode)")
    parser.add_argument('--file_bs', type=int, default=10000,
                        help="files per worker batch (gpt2 mode)")
    args = parser.parse_args()

    if args.tokenizer == 'spacy':
        tokenizeSpacy(args)
    elif args.tokenizer == 'gpt2':
        tokenizeGpt2Spawn(args)
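

# Example invocations (illustrative; adjust the glob and output directory to
# your own data):
#
#   python tokenize_text.py --input_glob 'data/*.txt' --output_dir tokenized --tokenizer spacy
#   python tokenize_text.py --input_glob 'data/*.txt' --output_dir tokenized \
#       --tokenizer gpt2 --combine 100000000 --file_bs 10000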