From b23e1c3ba7f5c972dc2b2f7ad8c0d499c19ab228 Mon Sep 17 00:00:00 2001
From: Hai Liang Wang
Date: Thu, 25 Oct 2018 11:29:41 +0800
Subject: [PATCH] Remove stopwords when computing edit distance
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 CHANGELOG.md         |  6 ++++++
 Requirements.txt     |  2 +-
 setup.py             |  2 +-
 synonyms/synonyms.py | 26 +++++++++++++++++++++++---
 4 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b537d45..8e2d01b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,9 @@
+# 3.10
+* Remove stopwords when computing edit distance
+
+# 3.9
+* fix bug
+
 # 3.8
 * Get the vector of a segmented sentence, built in BoW (bag-of-words) fashion
 
diff --git a/Requirements.txt b/Requirements.txt
index 578c8c3..b22cf6d 100644
--- a/Requirements.txt
+++ b/Requirements.txt
@@ -1 +1 @@
-synonyms>=3.6
\ No newline at end of file
+synonyms>=3.10
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 9ed1e80..ca834f0 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@
 setup(
     name='synonyms',
-    version='3.8.0',
+    version='3.10.0',
     description='Chinese Synonyms for Natural Language Processing and Understanding',
     long_description=LONGDOC,
     author='Hai Liang Wang, Hu Ying Xi',
diff --git a/synonyms/synonyms.py b/synonyms/synonyms.py
index d8f5000..4a217d5 100755
--- a/synonyms/synonyms.py
+++ b/synonyms/synonyms.py
@@ -78,7 +78,7 @@
 '''
 tokenizer_dict = os.path.join(curdir, 'data', 'vocab.txt')
 if "SYNONYMS_WORDSEG_DICT" in ENVIRON:
-    if os.exist(ENVIRON["SYNONYMS_WORDSEG_DICT"]):
+    if os.path.exists(ENVIRON["SYNONYMS_WORDSEG_DICT"]):
         print("info: set wordseg dict with %s" % tokenizer_dict)
         tokenizer_dict = ENVIRON["SYNONYMS_WORDSEG_DICT"]
     else: print("warning: can not find dict at [%s]" % tokenizer_dict)
@@ -303,23 +303,43 @@ def nearby(word):
         _cache_nearby[w] = (words, scores)
     return words, scores
 
-def compare(s1, s2, seg=True, ignore=False):
+def compare(s1, s2, seg=True, ignore=False, stopwords=False):
     '''
     compare similarity
     s1 : sentence1
     s2 : sentence2
     seg : True : The original sentences need jieba.cut
           False : The original sentences have been cut.
+    ignore: True: ignore OOV words
+            False: get vector randomly for OOV words
     '''
     if s1 == s2: return 1.0
+
+    s1_words = []
+    s2_words = []
+
     if seg:
         s1 = [x for x in jieba.cut(s1)]
         s2 = [x for x in jieba.cut(s2)]
     else:
         s1 = s1.split()
         s2 = s2.split()
+
+    # check stopwords
+    if not stopwords:
+        global _stopwords
+        for x in s1:
+            if not x in _stopwords:
+                s1_words.append(x)
+        for x in s2:
+            if not x in _stopwords:
+                s2_words.append(x)
+    else:
+        s1_words = s1
+        s2_words = s2
+
     assert len(s1) > 0 and len(s2) > 0, "The length of s1 and s2 should > 0."
-    return _similarity_distance(s1, s2, ignore)
+    return _similarity_distance(s1_words, s2_words, ignore)
 
 def display(word):
     print("'%s'近义词:" % word)
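
Note on the first synonyms.py hunk: os.exist is not a real function, so before this fix the SYNONYMS_WORDSEG_DICT override raised an AttributeError whenever the variable was set; os.path.exists is the correct call. A minimal sketch of how the override is meant to be used, assuming a hypothetical dictionary path; the variable has to be set before the package is imported, because the check runs at module load time.

    import os

    # Hypothetical path -- point this at your own word-segmentation vocab file.
    os.environ["SYNONYMS_WORDSEG_DICT"] = "/data/my_vocab.txt"

    # The check runs on import: with os.path.exists, an existing path replaces
    # the bundled data/vocab.txt; otherwise the "can not find dict" warning prints.
    import synonyms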
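
For reference, a usage sketch of the compare() signature after this patch. With the default stopwords=False, both sentences are segmented and stopwords are dropped before the similarity is computed; stopwords=True keeps every token, matching the pre-3.10 behaviour. The example sentences are illustrative, and the printed scores depend on the shipped word vectors.

    import synonyms

    s1 = "今天的天气真好"
    s2 = "今天天气很好"

    # Default: stopwords (such as "的") are filtered out of both token lists first.
    print(synonyms.compare(s1, s2, seg=True))

    # Keep all tokens, i.e. the behaviour of synonyms < 3.10.
    print(synonyms.compare(s1, s2, seg=True, stopwords=True))

    # Already-segmented input: pass seg=False with whitespace-separated tokens.
    print(synonyms.compare("今天 的 天气", "今天 天气", seg=False))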
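
One behavioural detail worth flagging: the assert still checks the unfiltered token lists (s1, s2), while the filtered lists (s1_words, s2_words) are what reach _similarity_distance, so a sentence made up entirely of stopwords passes the assert but contributes an empty list. The filtering step itself is a plain set lookup; below is a self-contained sketch of the same idea, using a tiny illustrative stopword set instead of the _stopwords list the library loads from its data files.

    # Standalone sketch of the stopword-filtering step added to compare().
    import jieba

    STOPWORDS = {"的", "了", "是", "在"}   # illustrative only

    def tokenize(sentence, keep_stopwords=False):
        # Segment first, then optionally drop stopwords -- same order as compare().
        tokens = list(jieba.cut(sentence))
        if keep_stopwords:
            return tokens
        return [t for t in tokens if t not in STOPWORDS]

    print(tokenize("今天的天气真好"))
    print(tokenize("今天的天气真好", keep_stopwords=True))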