From 8f4b4680cb47906c33f1d8f27b8e722c9deb2163 Mon Sep 17 00:00:00 2001 From: = Date: Wed, 11 Aug 2021 13:02:21 +0430 Subject: [PATCH 1/2] Fix some tokenization issues --- hazm/WordTokenizer.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/hazm/WordTokenizer.py b/hazm/WordTokenizer.py index cda1e986..89a90ed6 100644 --- a/hazm/WordTokenizer.py +++ b/hazm/WordTokenizer.py @@ -52,16 +52,19 @@ def __init__(self, words_file=default_words, verbs_file=default_verbs, join_verb u"\U0001F4CC\U0001F4CD" # other emojis "]", flags=re.UNICODE) self.emoji_repl = r'\g<0> ' - self.id_pattern = re.compile(r'([^\w\._]+)(@[\w_]+)') - self.id_repl = r'\1ID' - self.link_pattern = re.compile(r'((https?|ftp):\/\/)?(? Date: Wed, 11 Aug 2021 20:37:52 +0430 Subject: [PATCH 2/2] Only allow english letters in link token --- hazm/WordTokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hazm/WordTokenizer.py b/hazm/WordTokenizer.py index 89a90ed6..31a88487 100644 --- a/hazm/WordTokenizer.py +++ b/hazm/WordTokenizer.py @@ -54,7 +54,7 @@ def __init__(self, words_file=default_words, verbs_file=default_verbs, join_verb self.emoji_repl = r'\g<0> ' self.id_pattern = re.compile(r'(?