From a164685af4e23951b6210b325e97bddceefb9c58 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= <witiko@mail.muni.cz>
Date: Tue, 29 Jun 2021 07:23:41 +0200
Subject: [PATCH] Fix Unicode string incompatibility in
 gensim.similarities.fastss.editdist (#3178)

* Do not expect the same Unicode type in editdist

* Unit-test editdist

* Use pytest assertion syntax in unit tests

* Update CHANGELOG.md

Co-authored-by: Michael Penkov <m@penkov.dev>
---
 CHANGELOG.md                     |  1 +
 gensim/similarities/fastss.pyx   | 12 +++++-------
 gensim/test/test_similarities.py | 27 +++++++++++++++++++++++++++
 3 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 02fbee9125..3b4a0a0842 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ Changes
 * [#3116](https://github.com/RaRe-Technologies/gensim/pull/3116): Fix bug where saved Phrases model did not load its connector_words, by [@aloknayak29](https://github.com/aloknayak29)
 * [#3136](https://github.com/RaRe-Technologies/gensim/pull/3136): Fix indexing error in word2vec_inner.pyx, by [@bluekura](https://github.com/bluekura)
 * [#3174](https://github.com/RaRe-Technologies/gensim/pull/3174): Fix a bug when upgrading phraser from gensim 3.x to 4.0, by [@emgucv](https://github.com/emgucv)
+* [#3178](https://github.com/RaRe-Technologies/gensim/pull/3178): Fix Unicode string incompatibility in gensim.similarities.fastss.editdist, by [@Witiko](https://github.com/Witiko)
 * [#3176](https://github.com/RaRe-Technologies/gensim/pull/3176): Eliminate obsolete step parameter from doc2vec infer_vector and similarity_unseen_docs, by [@rock420](https://github.com/rock420)
 * [#2830](https://github.com/RaRe-Technologies/gensim/pull/2830): Fixed KeyError in coherence model, by [@pietrotrope](https://github.com/pietrotrope)
 
diff --git a/gensim/similarities/fastss.pyx b/gensim/similarities/fastss.pyx
index 63469243c2..07203073d7 100644
--- a/gensim/similarities/fastss.pyx
+++ b/gensim/similarities/fastss.pyx
@@ -29,8 +29,8 @@ cdef extern from *:
         WIDTH * CYTHON_RESTRICT pos_new;
         WIDTH * CYTHON_RESTRICT pos_old;
         int row_flip = 1;  /* Does pos_new represent row1 or row2? */
-        int kind = PyUnicode_KIND(s1);  /* How many bytes per unicode codepoint? */
-        if (kind != PyUnicode_KIND(s2)) return -1;
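+        int kind1 = PyUnicode_KIND(s1);  /* Bytes per code point in s1 (may differ from s2). */
+        int kind2 = PyUnicode_KIND(s2);  /* NOTE(review): read before the len_s1/len_s2 swap below; confirm kinds still match s1_data/s2_data. */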
 
         WIDTH len_s1 = (WIDTH)PyUnicode_GET_LENGTH(s1);
         WIDTH len_s2 = (WIDTH)PyUnicode_GET_LENGTH(s2);
@@ -39,7 +39,7 @@ cdef extern from *:
             const WIDTH tmpi = len_s1; len_s1 = len_s2; len_s2 = tmpi;
         }
         if (len_s2 - len_s1 > maximum) return maximum + 1;
-        if (len_s2 > MAX_WORD_LENGTH) return -2;
+        if (len_s2 > MAX_WORD_LENGTH) return -1;
         void * s1_data = PyUnicode_DATA(s1);
         void * s2_data = PyUnicode_DATA(s2);
 
@@ -47,7 +47,7 @@ cdef extern from *:
 
         for (WIDTH i2 = 0; i2 < len_s2; i2++) {
             int all_bad = i2 >= maximum;
-            const Py_UCS4 ch = PyUnicode_READ(kind, s2_data, i2);
+            const Py_UCS4 ch = PyUnicode_READ(kind2, s2_data, i2);
             row_flip = 1 - row_flip;
             if (row_flip) {
                 pos_new = row2; pos_old = row1;
@@ -58,7 +58,7 @@ cdef extern from *:
 
             for (WIDTH i1 = 0; i1 < len_s1; i1++) {
                 WIDTH val = *(pos_old++);
-                if (ch != PyUnicode_READ(kind, s1_data, i1)) {
+                if (ch != PyUnicode_READ(kind1, s1_data, i1)) {
                     const WIDTH _val1 = *pos_old;
                     const WIDTH _val2 = *pos_new;
                     if (_val1 < val) val = _val1;
@@ -96,8 +96,6 @@ def editdist(s1: str, s2: str, max_dist=None):
     if result >= 0:
         return result
     elif result == -1:
-        raise ValueError("incompatible types of unicode strings")
-    elif result == -2:
         raise ValueError(f"editdist doesn't support strings longer than {MAX_WORD_LENGTH} characters")
     else:
         raise ValueError(f"editdist returned an error: {result}")
diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py
index 2dda8d8a30..35ddd03397 100644
--- a/gensim/test/test_similarities.py
+++ b/gensim/test/test_similarities.py
@@ -33,6 +33,7 @@
 from gensim.similarities import SparseTermSimilarityMatrix
 from gensim.similarities import LevenshteinSimilarityIndex
 from gensim.similarities.docsim import _nlargest
+from gensim.similarities.fastss import editdist
 
 try:
     from pyemd import emd  # noqa:F401
@@ -1631,6 +1632,32 @@ def test_most_similar(self):
         self.assertTrue(numpy.allclose(first_similarities**2.0, second_similarities))
 
 
+class TestFastSS(unittest.TestCase):
+    def test_editdist_same_unicode_kind_latin1(self):
+        """Test editdist returns the expected result with two Latin-1 strings."""
+        expected = 2
+        actual = editdist('Zizka', 'siska')
+        assert expected == actual
+
+    def test_editdist_same_unicode_kind_ucs2(self):
+        """Test editdist returns the expected result with two UCS-2 strings."""
+        expected = 2
+        actual = editdist('Žižka', 'šiška')
+        assert expected == actual
+
+    def test_editdist_same_unicode_kind_ucs4(self):
+        """Test editdist returns the expected result with two UCS-4 strings."""
+        expected = 2
+        actual = editdist('Žižka 😀', 'šiška 😀')
+        assert expected == actual
+
+    def test_editdist_different_unicode_kinds(self):
+        """Test editdist returns the expected result with a UCS-2 string and a Latin-1 string."""
+        expected = 2
+        actual = editdist('Žižka', 'siska')
+        assert expected == actual
+
+
 if __name__ == '__main__':
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
     unittest.main()