Skip to content

Commit

Permalink
计算编辑距离,去停用词
Browse files Browse the repository at this point in the history
  • Loading branch information
hailiang-wang committed Oct 25, 2018
1 parent 9979984 commit b23e1c3
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 5 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# 3.10
* 计算编辑距离时去停用词

# 3.9
* fix bug

# 3.8
* 获得一个分词后句子的向量,向量以BoW方式组成

Expand Down
2 changes: 1 addition & 1 deletion Requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
synonyms>=3.6
synonyms>=3.10
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

setup(
name='synonyms',
version='3.8.0',
version='3.10.0',
description='Chinese Synonyms for Natural Language Processing and Understanding',
long_description=LONGDOC,
author='Hai Liang Wang, Hu Ying Xi',
Expand Down
26 changes: 23 additions & 3 deletions synonyms/synonyms.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@
'''
tokenizer_dict = os.path.join(curdir, 'data', 'vocab.txt')
if "SYNONYMS_WORDSEG_DICT" in ENVIRON:
if os.exist(ENVIRON["SYNONYMS_WORDSEG_DICT"]):
if os.path.exists(ENVIRON["SYNONYMS_WORDSEG_DICT"]):
print("info: set wordseg dict with %s" % tokenizer_dict)
tokenizer_dict = ENVIRON["SYNONYMS_WORDSEG_DICT"]
else: print("warning: can not find dict at [%s]" % tokenizer_dict)
Expand Down Expand Up @@ -303,23 +303,43 @@ def nearby(word):
_cache_nearby[w] = (words, scores)
return words, scores

def compare(s1, s2, seg=True, ignore=False, stopwords=False):
    '''
    Compare the similarity of two sentences.

    s1 : sentence1
    s2 : sentence2
    seg : True : the original sentences need jieba.cut
          False: the original sentences have already been cut
                 (whitespace separated)
    ignore : True : ignore OOV words
             False: get vector randomly for OOV words
    stopwords : False: remove stopwords before computing similarity (default)
                True : keep stopwords
                # NOTE(review): flag semantics are inverted-looking but kept
                # for backward compatibility with existing callers.

    Returns a similarity score; identical inputs short-circuit to 1.0.
    Raises AssertionError when either token list ends up empty.
    '''
    # Fast path: byte-identical inputs are trivially fully similar.
    if s1 == s2: return 1.0

    # Tokenize: segment with jieba, or trust pre-segmented whitespace input.
    if seg:
        s1 = [x for x in jieba.cut(s1)]
        s2 = [x for x in jieba.cut(s2)]
    else:
        s1 = s1.split()
        s2 = s2.split()

    # Optionally filter stopwords (_stopwords is a module-level set; reading
    # it needs no `global` statement).
    if not stopwords:
        s1_words = [x for x in s1 if x not in _stopwords]
        s2_words = [x for x in s2 if x not in _stopwords]
    else:
        s1_words = s1
        s2_words = s2

    # Validate the token lists that are actually passed downstream:
    # stopword removal may have emptied a sentence even when the raw
    # input was non-empty (the old code asserted on s1/s2 instead).
    assert len(s1_words) > 0 and len(s2_words) > 0, "The length of s1 and s2 should > 0."
    return _similarity_distance(s1_words, s2_words, ignore)

def display(word):
print("'%s'近义词:" % word)
Expand Down

0 comments on commit b23e1c3

Please sign in to comment.