-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtokenizer.py
37 lines (30 loc) · 943 Bytes
/
tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
try:
from . import facebook as fb
from . import cubadebate as cu
from . import telegram as tg
from . import twitter as tw
from .tokenizer.custom_tokenizer import SpacyCustomTokenizer, get_progressbar
except (ModuleNotFoundError, ImportError):
import facebook as fb
import cubadebate as cu
import telegram as tg
import twitter as tw
from tokenizer.custom_tokenizer import SpacyCustomTokenizer, get_progressbar
def get_all_text(path=''):
    """Gather the texts of every source module into one combined value.

    Calls ``get_text(path)`` on the facebook, telegram, twitter and
    cubadebate modules, in that order, and joins the results with ``+``.

    Args:
        path: Passed unchanged to each module's ``get_text``.

    Returns:
        The left-to-right ``+`` concatenation of the four results.
        NOTE(review): the script in this file unpacks every element as a
        pair, so each ``get_text`` presumably returns a list of 2-tuples
        -- confirm against the source modules.
    """
    first_source, *other_sources = (fb, tg, tw, cu)
    combined = first_source.get_text(path)
    for source in other_sources:
        # Plain ``+`` (not ``+=``) so the object returned by the first
        # module is never modified in place.
        combined = combined + source.get_text(path)
    return combined
# --- Stage 1: collect the texts from every source ---------------------------
print("------ Init Tokenizer ----------")
texts = get_all_text('')

# --- Stage 2: build the tokenizer and call its load hook --------------------
print("------ Load Tokenizer ----------")
nlp = SpacyCustomTokenizer()
# NOTE(review): presumably restores previously saved tokenizer state -- confirm.
nlp.__load__()

# --- Stage 3: run every text through the tokenizer --------------------------
print("------ Init Process ----------")
bar = get_progressbar(len(texts))
bar.start()

i = 0  # keeps ``i`` defined even when ``texts`` is empty
for i, (text, _) in enumerate(texts, start=1):
    # Exhaust the token stream; the tokens themselves are discarded.
    # NOTE(review): presumably done for the tokenizer's side effects -- confirm.
    for token in nlp(text):
        pass
    # Report the 1-based count of texts processed so far.
    bar.update(i)

bar.finish()

# NOTE(review): presumably persists the tokenizer state -- confirm.
nlp.__save__()