Skip to content

Commit

Permalink
计算编辑距离,去停用词
Browse files Browse the repository at this point in the history
  • Loading branch information
hailiang-wang committed Oct 25, 2018
1 parent 9979984 commit b23e1c3
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 5 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# 3.10
* 计算编辑距离时去停用词

# 3.9
* fix bug

# 3.8
* 获得一个分词后句子的向量,向量以BoW方式组成

Expand Down
2 changes: 1 addition & 1 deletion Requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
synonyms>=3.6
synonyms>=3.10
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

setup(
name='synonyms',
version='3.8.0',
version='3.10.0',
description='Chinese Synonyms for Natural Language Processing and Understanding',
long_description=LONGDOC,
author='Hai Liang Wang, Hu Ying Xi',
Expand Down
26 changes: 23 additions & 3 deletions synonyms/synonyms.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@
'''
tokenizer_dict = os.path.join(curdir, 'data', 'vocab.txt')
if "SYNONYMS_WORDSEG_DICT" in ENVIRON:
if os.exist(ENVIRON["SYNONYMS_WORDSEG_DICT"]):
if os.path.exists(ENVIRON["SYNONYMS_WORDSEG_DICT"]):
print("info: set wordseg dict with %s" % tokenizer_dict)
tokenizer_dict = ENVIRON["SYNONYMS_WORDSEG_DICT"]
else: print("warning: can not find dict at [%s]" % tokenizer_dict)
Expand Down Expand Up @@ -303,23 +303,43 @@ def nearby(word):
_cache_nearby[w] = (words, scores)
return words, scores

def compare(s1, s2, seg=True, ignore=False, stopwords=False):
    '''
    Compare the similarity of two sentences.

    s1 : sentence1
    s2 : sentence2
    seg : True : the original sentences need jieba.cut
          False: the original sentences have already been cut
                 (whitespace separated)
    ignore : True : ignore OOV words
             False: get vector randomly for OOV words
    stopwords : False: remove stopwords before computing similarity (default)
                True : keep stopwords
                # NOTE(review): flag semantics are inverted-looking but kept
                # for backward compatibility with existing callers.

    Returns a similarity score; identical inputs short-circuit to 1.0.
    Raises AssertionError when either token list ends up empty.
    '''
    # Fast path: byte-identical inputs are trivially fully similar.
    if s1 == s2: return 1.0

    # Tokenize: segment with jieba, or trust pre-segmented whitespace input.
    if seg:
        s1 = [x for x in jieba.cut(s1)]
        s2 = [x for x in jieba.cut(s2)]
    else:
        s1 = s1.split()
        s2 = s2.split()

    # Optionally filter stopwords (_stopwords is a module-level set; reading
    # it needs no `global` statement).
    if not stopwords:
        s1_words = [x for x in s1 if x not in _stopwords]
        s2_words = [x for x in s2 if x not in _stopwords]
    else:
        s1_words = s1
        s2_words = s2

    # Validate the token lists that are actually passed downstream:
    # stopword removal may have emptied a sentence even when the raw
    # input was non-empty (the old code asserted on s1/s2 instead).
    assert len(s1_words) > 0 and len(s2_words) > 0, "The length of s1 and s2 should > 0."
    return _similarity_distance(s1_words, s2_words, ignore)

def display(word):
print("'%s'近义词:" % word)
Expand Down

0 comments on commit b23e1c3

Please sign in to comment.