From 2e8c326a7a07c7f544c22f161dcde2ff78acaf3b Mon Sep 17 00:00:00 2001
From: trungtv
Date: Thu, 14 May 2020 18:08:44 +0700
Subject: [PATCH] fix bug

---
 pyvi/ViTokenizer.py | 12 ++++++------
 setup.py            |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/pyvi/ViTokenizer.py b/pyvi/ViTokenizer.py
index 41f17aa..7056917 100644
--- a/pyvi/ViTokenizer.py
+++ b/pyvi/ViTokenizer.py
@@ -108,11 +108,11 @@ def sylabelize(text):
             patterns = patterns.decode('utf-8')
         tokens = re.findall(patterns, text, re.UNICODE)
 
-        return [token[0] for token in tokens]
+        return text, [token[0] for token in tokens]
 
     @staticmethod
     def tokenize(str):
-        tmp = ViTokenizer.sylabelize(str)
+        text, tmp = ViTokenizer.sylabelize(str)
         if len(tmp) == 0:
             return str
         labels = ViTokenizer.model.predict([ViTokenizer.sent2features(tmp, False)])
@@ -129,7 +129,7 @@ def tokenize(str):
 
     @staticmethod
     def spacy_tokenize(str):
-        tmp = ViTokenizer.sylabelize(str)
+        text, tmp = ViTokenizer.sylabelize(str)
         if len(tmp) == 0:
             return str
         labels = ViTokenizer.model.predict([ViTokenizer.sent2features(tmp, False)])
@@ -146,14 +146,14 @@ def spacy_tokenize(str):
                 tokens.append(token)
                 token = tmp[i]
         tokens.append(token)
-        tmp = re.sub("\s\s+" , " ", str)
+        text = re.sub("\s\s+" , " ", text)
 #        print(tmp)
 
         i = 0
         for token in tokens:
             i = i + len(token)
-#            print("{}:{}:{}".format(token,tmp[i], i))
-            if i < len(tmp) and tmp[i] == ' ':
+#            print("{}:{}:{}".format(token,text[i], i))
+            if i < len(text) and text[i] == ' ':
                 spaces.append(True)
                 i += 1
             else:
diff --git a/setup.py b/setup.py
index 0e3cba8..8c05ebe 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@
     # Versions should comply with PEP440. For a discussion on single-sourcing
     # the version across setup.py and the project code, see
     # https://packaging.python.org/en/latest/single_source_version.html
-    version='0.0.9.8',
+    version='0.0.9.9',
 
     description='Python Vietnamese Toolkit',
     long_description=long_description,
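
For reviewers, a minimal standalone sketch of the alignment problem this patch addresses: spacy_tokenize walks the text character by character after each token to decide the spaces flags, so indexing into the raw input (with its repeated whitespace) drifts, while indexing into the whitespace-normalized text returned by sylabelize stays aligned. The sample string and variable names below are hypothetical illustrations, not pyvi API.

# Illustrative sketch only (hypothetical names, not part of the patch).
import re

raw = "Ha  Noi oi"                 # raw input with a doubled space
tokens = ["Ha", "Noi", "oi"]       # syllables the tokenizer would emit

# The patch normalizes whitespace once and keeps the result as `text`.
text = re.sub(r"\s\s+", " ", raw)  # -> "Ha Noi oi"

# Same walk as spacy_tokenize: after each token, is the next char a space?
spaces, i = [], 0
for token in tokens:
    i += len(token)
    if i < len(text) and text[i] == ' ':
        spaces.append(True)
        i += 1
    else:
        spaces.append(False)

print(spaces)  # [True, True, False]
# Walking `raw` instead would hit the second space of "Ha  " and report
# False for "Noi" -- the misalignment the patch fixes by tracking `text`.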