Skip to content

Commit

Permalink
Fix Unicode string incompatibility in gensim.similarities.fastss.edit…
Browse files Browse the repository at this point in the history
…dist (#3178)

* Do not expect the same Unicode type in editdist

* Unit-test editdist

* Use pytest assertion syntax in unit tests

* Update CHANGELOG.md

Co-authored-by: Michael Penkov <m@penkov.dev>
  • Loading branch information
Witiko and mpenkov authored Jun 29, 2021
1 parent 52fade6 commit a164685
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 7 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Changes
* [#3116](https://github.com/RaRe-Technologies/gensim/pull/3116): Fix bug where saved Phrases model did not load its connector_words, by [@aloknayak29](https://github.com/aloknayak29)
* [#3136](https://github.com/RaRe-Technologies/gensim/pull/3136): Fix indexing error in word2vec_inner.pyx, by [@bluekura](https://github.com/bluekura)
* [#3174](https://github.com/RaRe-Technologies/gensim/pull/3174): Fix a bug when upgrading phraser from gensim 3.x to 4.0, by [@emgucv](https://github.com/emgucv)
* [#3178](https://github.com/RaRe-Technologies/gensim/pull/3178): Fix Unicode string incompatibility in gensim.similarities.fastss.editdist, by [@Witiko](https://github.com/Witiko)
* [#3176](https://github.com/RaRe-Technologies/gensim/pull/3176): Eliminate obsolete step parameter from doc2vec infer_vector and similarity_unseen_docs, by [@rock420](https://github.com/rock420)
* [#2830](https://github.com/RaRe-Technologies/gensim/pull/2830): Fixed KeyError in coherence model, by [@pietrotrope](https://github.com/pietrotrope)

Expand Down
12 changes: 5 additions & 7 deletions gensim/similarities/fastss.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ cdef extern from *:
WIDTH * CYTHON_RESTRICT pos_new;
WIDTH * CYTHON_RESTRICT pos_old;
int row_flip = 1; /* Does pos_new represent row1 or row2? */
int kind = PyUnicode_KIND(s1); /* How many bytes per unicode codepoint? */
if (kind != PyUnicode_KIND(s2)) return -1;
int kind1 = PyUnicode_KIND(s1); /* How many bytes per unicode codepoint? */
int kind2 = PyUnicode_KIND(s2);
WIDTH len_s1 = (WIDTH)PyUnicode_GET_LENGTH(s1);
WIDTH len_s2 = (WIDTH)PyUnicode_GET_LENGTH(s2);
Expand All @@ -39,15 +39,15 @@ cdef extern from *:
const WIDTH tmpi = len_s1; len_s1 = len_s2; len_s2 = tmpi;
}
if (len_s2 - len_s1 > maximum) return maximum + 1;
if (len_s2 > MAX_WORD_LENGTH) return -2;
if (len_s2 > MAX_WORD_LENGTH) return -1;
void * s1_data = PyUnicode_DATA(s1);
void * s2_data = PyUnicode_DATA(s2);
for (WIDTH tmpi = 0; tmpi <= len_s1; tmpi++) row2[tmpi] = tmpi;
for (WIDTH i2 = 0; i2 < len_s2; i2++) {
int all_bad = i2 >= maximum;
const Py_UCS4 ch = PyUnicode_READ(kind, s2_data, i2);
const Py_UCS4 ch = PyUnicode_READ(kind2, s2_data, i2);
row_flip = 1 - row_flip;
if (row_flip) {
pos_new = row2; pos_old = row1;
Expand All @@ -58,7 +58,7 @@ cdef extern from *:
for (WIDTH i1 = 0; i1 < len_s1; i1++) {
WIDTH val = *(pos_old++);
if (ch != PyUnicode_READ(kind, s1_data, i1)) {
if (ch != PyUnicode_READ(kind1, s1_data, i1)) {
const WIDTH _val1 = *pos_old;
const WIDTH _val2 = *pos_new;
if (_val1 < val) val = _val1;
Expand Down Expand Up @@ -96,8 +96,6 @@ def editdist(s1: str, s2: str, max_dist=None):
if result >= 0:
return result
elif result == -1:
raise ValueError("incompatible types of unicode strings")
elif result == -2:
raise ValueError(f"editdist doesn't support strings longer than {MAX_WORD_LENGTH} characters")
else:
raise ValueError(f"editdist returned an error: {result}")
Expand Down
27 changes: 27 additions & 0 deletions gensim/test/test_similarities.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import LevenshteinSimilarityIndex
from gensim.similarities.docsim import _nlargest
from gensim.similarities.fastss import editdist

try:
from pyemd import emd # noqa:F401
Expand Down Expand Up @@ -1631,6 +1632,32 @@ def test_most_similar(self):
self.assertTrue(numpy.allclose(first_similarities**2.0, second_similarities))


class TestFastSS(unittest.TestCase):
def test_editdist_same_unicode_kind_latin1(self):
"""Test editdist returns the expected result with two Latin-1 strings."""
expected = 2
actual = editdist('Zizka', 'siska')
assert expected == actual

def test_editdist_same_unicode_kind_ucs2(self):
"""Test editdist returns the expected result with two UCS-2 strings."""
expected = 2
actual = editdist('Žižka', 'šiška')
assert expected == actual

def test_editdist_same_unicode_kind_ucs4(self):
"""Test editdist returns the expected result with two UCS-4 strings."""
expected = 2
actual = editdist('Žižka 😀', 'šiška 😀')
assert expected == actual

def test_editdist_different_unicode_kinds(self):
"""Test editdist returns the expected result with strings of different Unicode kinds."""
expected = 2
actual = editdist('Žižka', 'siska')
assert expected == actual


if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
unittest.main()

0 comments on commit a164685

Please sign in to comment.