From a164685af4e23951b6210b325e97bddceefb9c58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Tue, 29 Jun 2021 07:23:41 +0200 Subject: [PATCH] Fix Unicode string incompatibility in gensim.similarities.fastss.editdist (#3178) * Do not expect the same Unicode type in editdist * Unit-test editdist * Use pytest assertion syntax in unit tests * Update CHANGELOG.md Co-authored-by: Michael Penkov --- CHANGELOG.md | 1 + gensim/similarities/fastss.pyx | 12 +++++------- gensim/test/test_similarities.py | 27 +++++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 02fbee9125..3b4a0a0842 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ Changes * [#3116](https://github.com/RaRe-Technologies/gensim/pull/3116): Fix bug where saved Phrases model did not load its connector_words, by [@aloknayak29](https://github.com/aloknayak29) * [#3136](https://github.com/RaRe-Technologies/gensim/pull/3136): Fix indexing error in word2vec_inner.pyx, by [@bluekura](https://github.com/bluekura) * [#3174](https://github.com/RaRe-Technologies/gensim/pull/3174): Fix a bug when upgrading phraser from gensim 3.x to 4.0, by [@emgucv](https://github.com/emgucv) +* [#3178](https://github.com/RaRe-Technologies/gensim/pull/3178): Fix Unicode string incompatibility in gensim.similarities.fastss.editdist, by [@Witiko](https://github.com/Witiko) * [#3176](https://github.com/RaRe-Technologies/gensim/pull/3176): Eliminate obsolete step parameter from doc2vec infer_vector and similarity_unseen_docs, by [@rock420](https://github.com/rock420) * [#2830](https://github.com/RaRe-Technologies/gensim/pull/2830): Fixed KeyError in coherence model, by [@pietrotrope](https://github.com/pietrotrope) diff --git a/gensim/similarities/fastss.pyx b/gensim/similarities/fastss.pyx index 63469243c2..07203073d7 100644 --- a/gensim/similarities/fastss.pyx +++ b/gensim/similarities/fastss.pyx @@ -29,8 +29,8 @@ cdef extern from *: WIDTH * CYTHON_RESTRICT pos_new; WIDTH * CYTHON_RESTRICT pos_old; int row_flip = 1; /* Does pos_new represent row1 or row2? */ - int kind = PyUnicode_KIND(s1); /* How many bytes per unicode codepoint? */ - if (kind != PyUnicode_KIND(s2)) return -1; + int kind1 = PyUnicode_KIND(s1); /* How many bytes per unicode codepoint? */ + int kind2 = PyUnicode_KIND(s2); WIDTH len_s1 = (WIDTH)PyUnicode_GET_LENGTH(s1); WIDTH len_s2 = (WIDTH)PyUnicode_GET_LENGTH(s2); @@ -39,7 +39,7 @@ cdef extern from *: const WIDTH tmpi = len_s1; len_s1 = len_s2; len_s2 = tmpi; } if (len_s2 - len_s1 > maximum) return maximum + 1; - if (len_s2 > MAX_WORD_LENGTH) return -2; + if (len_s2 > MAX_WORD_LENGTH) return -1; void * s1_data = PyUnicode_DATA(s1); void * s2_data = PyUnicode_DATA(s2); @@ -47,7 +47,7 @@ cdef extern from *: for (WIDTH i2 = 0; i2 < len_s2; i2++) { int all_bad = i2 >= maximum; - const Py_UCS4 ch = PyUnicode_READ(kind, s2_data, i2); + const Py_UCS4 ch = PyUnicode_READ(kind2, s2_data, i2); row_flip = 1 - row_flip; if (row_flip) { pos_new = row2; pos_old = row1; @@ -58,7 +58,7 @@ cdef extern from *: for (WIDTH i1 = 0; i1 < len_s1; i1++) { WIDTH val = *(pos_old++); - if (ch != PyUnicode_READ(kind, s1_data, i1)) { + if (ch != PyUnicode_READ(kind1, s1_data, i1)) { const WIDTH _val1 = *pos_old; const WIDTH _val2 = *pos_new; if (_val1 < val) val = _val1; @@ -96,8 +96,6 @@ def editdist(s1: str, s2: str, max_dist=None): if result >= 0: return result elif result == -1: - raise ValueError("incompatible types of unicode strings") - elif result == -2: raise ValueError(f"editdist doesn't support strings longer than {MAX_WORD_LENGTH} characters") else: raise ValueError(f"editdist returned an error: {result}") diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py index 2dda8d8a30..35ddd03397 100644 --- a/gensim/test/test_similarities.py +++ b/gensim/test/test_similarities.py @@ -33,6 +33,7 @@ from gensim.similarities import SparseTermSimilarityMatrix from gensim.similarities import LevenshteinSimilarityIndex from gensim.similarities.docsim import _nlargest +from gensim.similarities.fastss import editdist try: from pyemd import emd # noqa:F401 @@ -1631,6 +1632,32 @@ def test_most_similar(self): self.assertTrue(numpy.allclose(first_similarities**2.0, second_similarities)) +class TestFastSS(unittest.TestCase): + def test_editdist_same_unicode_kind_latin1(self): + """Test editdist returns the expected result with two Latin-1 strings.""" + expected = 2 + actual = editdist('Zizka', 'siska') + assert expected == actual + + def test_editdist_same_unicode_kind_ucs2(self): + """Test editdist returns the expected result with two UCS-2 strings.""" + expected = 2 + actual = editdist('Žižka', 'šiška') + assert expected == actual + + def test_editdist_same_unicode_kind_ucs4(self): + """Test editdist returns the expected result with two UCS-4 strings.""" + expected = 2 + actual = editdist('Žižka 😀', 'šiška 😀') + assert expected == actual + + def test_editdist_different_unicode_kinds(self): + """Test editdist returns the expected result with strings of different Unicode kinds.""" + expected = 2 + actual = editdist('Žižka', 'siska') + assert expected == actual + + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main()