From b23e1c3ba7f5c972dc2b2f7ad8c0d499c19ab228 Mon Sep 17 00:00:00 2001
From: Hai Liang Wang
Date: Thu, 25 Oct 2018 11:29:41 +0800
Subject: [PATCH] Remove stopwords when computing edit distance
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 CHANGELOG.md         |  6 ++++++
 Requirements.txt     |  2 +-
 setup.py             |  2 +-
 synonyms/synonyms.py | 26 +++++++++++++++++++++++---
 4 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b537d45..8e2d01b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,9 @@
+# 3.10
+* Remove stopwords when computing edit distance
+
+# 3.9
+* fix bug
+
 # 3.8
 * Get the vector of a segmented sentence, built in BoW (bag-of-words) fashion
 
diff --git a/Requirements.txt b/Requirements.txt
index 578c8c3..b22cf6d 100644
--- a/Requirements.txt
+++ b/Requirements.txt
@@ -1 +1 @@
-synonyms>=3.6
\ No newline at end of file
+synonyms>=3.10
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 9ed1e80..ca834f0 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@
 setup(
     name='synonyms',
-    version='3.8.0',
+    version='3.10.0',
     description='Chinese Synonyms for Natural Language Processing and Understanding',
     long_description=LONGDOC,
     author='Hai Liang Wang, Hu Ying Xi',
diff --git a/synonyms/synonyms.py b/synonyms/synonyms.py
index d8f5000..4a217d5 100755
--- a/synonyms/synonyms.py
+++ b/synonyms/synonyms.py
@@ -78,7 +78,7 @@
 '''
 tokenizer_dict = os.path.join(curdir, 'data', 'vocab.txt')
 if "SYNONYMS_WORDSEG_DICT" in ENVIRON:
-    if os.exist(ENVIRON["SYNONYMS_WORDSEG_DICT"]):
+    if os.path.exists(ENVIRON["SYNONYMS_WORDSEG_DICT"]):
         print("info: set wordseg dict with %s" % tokenizer_dict)
         tokenizer_dict = ENVIRON["SYNONYMS_WORDSEG_DICT"]
     else: print("warning: can not find dict at [%s]" % tokenizer_dict)
@@ -303,23 +303,43 @@ def nearby(word):
         _cache_nearby[w] = (words, scores)
     return words, scores
 
-def compare(s1, s2, seg=True, ignore=False):
+def compare(s1, s2, seg=True, ignore=False, stopwords=False):
     '''
     compare similarity
     s1 : sentence1
     s2 : sentence2
     seg : True : The original sentences need jieba.cut
           False : The original sentences have been cut.
+    ignore: True: ignore OOV words
+            False: get vector randomly for OOV words
     '''
     if s1 == s2: return 1.0
+
+    s1_words = []
+    s2_words = []
+
     if seg:
         s1 = [x for x in jieba.cut(s1)]
         s2 = [x for x in jieba.cut(s2)]
     else:
         s1 = s1.split()
         s2 = s2.split()
+
+    # check stopwords
+    if not stopwords:
+        global _stopwords
+        for x in s1:
+            if not x in _stopwords:
+                s1_words.append(x)
+        for x in s2:
+            if not x in _stopwords:
+                s2_words.append(x)
+    else:
+        s1_words = s1
+        s2_words = s2
+
     assert len(s1) > 0 and len(s2) > 0, "The length of s1 and s2 should > 0."
-    return _similarity_distance(s1, s2, ignore)
+    return _similarity_distance(s1_words, s2_words, ignore)
 
 def display(word):
     print("'%s'近义词:" % word)
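
Note on the first synonyms.py hunk: os.exist is not a real function, so before this fix the SYNONYMS_WORDSEG_DICT override raised an AttributeError whenever the variable was set; os.path.exists is the correct call. A minimal sketch of how the override is meant to be used, assuming a hypothetical dictionary path; the variable has to be set before the package is imported, because the check runs at module load time.

    import os

    # Hypothetical path -- point this at your own word-segmentation vocab file.
    os.environ["SYNONYMS_WORDSEG_DICT"] = "/data/my_vocab.txt"

    # The check runs on import: with os.path.exists, an existing path replaces
    # the bundled data/vocab.txt; otherwise the "can not find dict" warning prints.
    import synonyms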
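
For reference, a usage sketch of the compare() signature after this patch. With the default stopwords=False, both sentences are segmented and stopwords are dropped before the similarity is computed; stopwords=True keeps every token, matching the pre-3.10 behaviour. The example sentences are illustrative, and the printed scores depend on the shipped word vectors.

    import synonyms

    s1 = "今天的天气真好"
    s2 = "今天天气很好"

    # Default: stopwords (such as "的") are filtered out of both token lists first.
    print(synonyms.compare(s1, s2, seg=True))

    # Keep all tokens, i.e. the behaviour of synonyms < 3.10.
    print(synonyms.compare(s1, s2, seg=True, stopwords=True))

    # Already-segmented input: pass seg=False with whitespace-separated tokens.
    print(synonyms.compare("今天 的 天气", "今天 天气", seg=False))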
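
One behavioural detail worth flagging: the assert still checks the unfiltered token lists (s1, s2), while the filtered lists (s1_words, s2_words) are what reach _similarity_distance, so a sentence made up entirely of stopwords passes the assert but contributes an empty list. The filtering step itself is a plain set lookup; below is a self-contained sketch of the same idea, using a tiny illustrative stopword set instead of the _stopwords list the library loads from its data files.

    # Standalone sketch of the stopword-filtering step added to compare().
    import jieba

    STOPWORDS = {"的", "了", "是", "在"}   # illustrative only

    def tokenize(sentence, keep_stopwords=False):
        # Segment first, then optionally drop stopwords -- same order as compare().
        tokens = list(jieba.cut(sentence))
        if keep_stopwords:
            return tokens
        return [t for t in tokens if t not in STOPWORDS]

    print(tokenize("今天的天气真好"))
    print(tokenize("今天的天气真好", keep_stopwords=True))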