minor updates

nathanrooy · Oct 6, 2022 · 5fb1d98 · 5fb1d98
1 parent 7d661ff
commit 5fb1d98
Show file tree

Hide file tree

Showing 7 changed files with 119 additions and 49 deletions.
diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml
@@ -1,9 +1,6 @@
 name: Build + PyPI
 
-on:
-  release:
-    types: [created]
-  workflow_dispatch:
+on: [workflow_dispatch]
 
 jobs:
   build_wheels:

diff --git a/README.md b/README.md
@@ -1 +1,51 @@
-# turboshtein
+# turboshtein
+
+[![gh-actions-ci](https://img.shields.io/github/workflow/status/nathanrooy/turboshtein/ci?style=flat-square)](https://github.com/nathanrooy/turboshtein/actions?query=workflow%3Aci)
+[![GitHub license](https://img.shields.io/github/license/nathanrooy/turboshtein?style=flat-square)](https://github.com/nathanrooy/turboshtein/blob/master/LICENSE)
+[![PyPI pyversions](https://img.shields.io/pypi/pyversions/turboshtein.svg?style=flat-square)](https://pypi.org/pypi/turboshtein/)
+[![PyPi Version](https://img.shields.io/pypi/v/turboshtein.svg?style=flat-square)](https://pypi.org/project/turboshtein)
+
+## Background
+I didn't study computer science, so [bitwise operations](https://en.wikipedia.org/wiki/Bitwise_operation) have always been a bit of a gray area for me. I finally gave in and decided to spend some time reading up on the subject and ended up implementing the Myers bit-parallel algorithm<sup>[[1](https://github.com/nathanrooy/turboshtein#references)]</sup> for computing [Levenshtein edit distance](https://en.wikipedia.org/wiki/Levenshtein_distance). I was fairly happy with the results, so I wrapped the original C code in Python and uploaded it to PyPI. Coincidentally, it's also pretty fast.
+
+## Usage
+Install via PyPI:
+
+```sh
+pip install turboshtein
+```
+
+Or install from source:
+```sh
+python setup.py install
+```
+
+Computing the Levenshtein edit distance between two strings:
+```py
+>>> from turboshtein import levenshtein
+>>> levenshtein("saturday", "sunday")
+3
+```
+
+Note that there are a few limitations:
+- Each input string must be shorter than 64 characters.
+- ASCII characters only.
+
+## Performance
+Values represent the number of string pairs processed per second.
+
+|                                 library | version |     m=n=8 |    m=n=16 |    m=n=24 |    m=n=32 |    m=n=40 |    m=n=48 |    m=n=56 |
+|----------------------------------------:|--------:|----------:|----------:|----------:|----------:|----------:|----------:|----------:|
+|                 turboshtein.levenshtein |   0.0.4 | 7,625,612 | 6,134,367 | 5,160,757 | 4,455,692 | 3,897,966 | 3,477,885 | 3,147,237 |
+| [Levenshtein.distance](https://github.com/maxbachmann/Levenshtein) |  0.20.5 | 6,448,035 | 5,335,239 | 4,556,680 | 4,034,226 | 3,550,695 | 3,211,664 | 2,910,581 |
+| [rapidfuzz.distance.Levenshtein.distance](https://github.com/maxbachmann/RapidFuzz) |  2.11.1 | 6,446,954 | 5,268,454 | 4,560,213 | 4,027,110 | 3,540,574 | 3,215,206 | 2,912,386 |
+| [jellyfish.levenshtein_distance](https://github.com/jamesturk/jellyfish) |   0.9.0 | 3,823,843 | 1,494,087 |   718,785 |   399,693 |   262,541 |   183,554 |   135,081 |
+| [textdistance.levenshtein](https://github.com/life4/textdistance) |   4.5.0 |   444,305 |   413,950 |   411,441 |   401,539 |   395,777 |   389,462 |   385,233 |
+
+Test setup:
+- XPS 15 9570
+- Intel i7-8750H
+- Fedora Linux 36 (Workstation Edition)
+
+## References
+[1]: Gene Myers. 1999. A fast bit-vector algorithm for approximate string matching based on dynamic programming. J. ACM 46, 3 (May 1999), 395-415. https://doi.org/10.1145/316542.316550
diff --git a/setup.py b/setup.py
@@ -23,6 +23,6 @@
     long_description_content_type="text/markdown",
     name='turboshtein',
     python_requires='>=3.4',
-    url='https://github.com/nathanrooy/turboshtein',
-    version='0.0.2'
-)
+    url='https://github.com/nathanrooy/turboshtein',
+    version='0.0.4'
+)
diff --git a/test/test_all.py b/test/test_all.py
@@ -1,27 +1,18 @@
 import unittest
-from random import choices, sample
 from turboshtein import levenshtein
-
-
-ascii = '''0123456789\
-abcdefghijklmnopqrstuvwxyz\
-ABCDEFGHIJKLMNOPQRSTUVWXYZ\
-!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~'''
+from .utils import create_strings
 
 
 class TestLevenshtein(unittest.TestCase):
     def test_ascii(self):
         for s_len in range(1, 64):
             for edit_dist in range(0, s_len + 1):
-                s1 = choices(ascii, k=s_len)
-                s2 = s1[:]
-                for idx in sample([i for i in range(len(s1))], edit_dist):
-                    s2[idx] = choices([c for c in ascii if c not in s1], k=1)[0]
+                s1, s2 = create_strings(s_len, edit_dist)
                 self.assertEqual(
                     edit_dist,
-                    levenshtein(''.join(s1), ''.join(s2))
+                    levenshtein(s1, s2)
                 )
 
 
 if __name__ == '__main__':
-    unittest.main()
+    unittest.main()
diff --git a/test/test_performance.py b/test/test_performance.py
@@ -1,17 +1,12 @@
-from random import choices, randint, sample
-from timeit import timeit
 from math import floor
+from random import randint
+from timeit import repeat
 import importlib
 import pandas as pd
+from utils import create_strings
 
 
-ascii = '''0123456789\
-abcdefghijklmnopqrstuvwxyz\
-ABCDEFGHIJKLMNOPQRSTUVWXYZ\
-!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~'''
-
-
-SAMPLE_SIZE = 100_00
+SAMPLE_SIZE = 10_000
 S_LENS = [8, 16, 24, 32, 40, 48, 56]
 LIBS = [
     ('jellyfish','levenshtein_distance'),
@@ -22,16 +17,8 @@
 ]
 
 
-def create_strings(s_len, edit_dist):
-    s1 = choices(ascii, k=s_len)
-    s2 = s1[:]
-    for idx in sample([i for i in range(len(s1))], edit_dist):
-        s2[idx] = choices([c for c in ascii if c not in s1], k=1)[0]
-    return [''.join(s1), ''.join(s2)]
-
-
 funcs = []
-for mname, fname in sorted(LIBS):
+for mname, fname in LIBS:
     module = importlib.import_module(mname)
     for _fname in fname.split('.'):
         module = getattr(module, _fname)
@@ -42,18 +29,13 @@ def create_strings(s_len, edit_dist):
 for s_len in S_LENS:
     w = [create_strings(s_len, randint(0, s_len)) for _ in range(SAMPLE_SIZE)]
     for f, lib in zip(*[funcs, LIBS]):
-        t = timeit(lambda: [f(w1, w2) for w1, w2 in w], number=1)
-        df.append(['.'.join(lib), s_len, t])
+        t = repeat(lambda: [f(w1, w2) for w1, w2 in w], repeat=1_000, number=1)
+        df.append(['.'.join(lib), s_len, min(t)])
 
 df = pd.DataFrame(df, columns=['library', 'string_length', 'duration'])
 df['pairs/sec'] = df['duration'].apply(lambda x: floor(SAMPLE_SIZE / x))
 df = (
-    df
-    .pivot_table(
-        index='library',
-        columns='string_length',
-        values='pairs/sec'
-    )
+    df.pivot_table(index='library', columns='string_length', values='pairs/sec')
     .reset_index()
     .sort_values(by=[8], ascending=False)
 )
@@ -62,4 +44,4 @@ def create_strings(s_len, edit_dist):
 print(
     df.rename(columns={c:f"m=n={c}" for c in df.columns if c!='library'})
     .to_markdown(index=False, stralign="right")
-)
+)
diff --git a/test/utils.py b/test/utils.py
@@ -0,0 +1,15 @@
+from random import choices, sample
+
+
+ascii_chars = '''0123456789\
+abcdefghijklmnopqrstuvwxyz\
+ABCDEFGHIJKLMNOPQRSTUVWXYZ\
+!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~'''
+
+
+def create_strings(s_len, edit_dist):
+    s1 = choices(ascii_chars, k=s_len)
+    s2 = s1[:]
+    for idx in sample([i for i in range(len(s1))], edit_dist):
+        s2[idx] = choices([c for c in ascii_chars if c not in s1], k=1)[0]
+    return [''.join(s1), ''.join(s2)]
diff --git a/turboshtein.c b/turboshtein.c
@@ -92,12 +92,47 @@ static PyObject* turboshtein_levenshtein(PyObject *self, PyObject *args) {
 }
 
 
+PyDoc_STRVAR(levenshtein_doc,
+    "Computes the Levenshtein distance between two strings.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "a : str\n"
+    "    first string\n"
+    "b : str\n"
+    "    second string\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "edit distance : int\n"
+    "    The Levenshtein edit distance between strings `a` and `b`\n"
+    "\n"
+    "Examples\n"
+    "--------\n"
+    ">>> from turboshtein import levenshtein\n"
+    ">>> levenshtein('saturday','sunday')\n"
+    "3\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "This function utilizes the bit-vector algorithm developed by Myers[1]."
+    "\nIt's very fast, but this implementation has a few limitations:\n"
+    "   1. Both strings must consist of exclusively ascii characters.\n"
+    "   2. Input string lengths must be less than 64 characters each.\n"
+    "\n"
+    "[1] Gene Myers. 1999. A fast bit-vector algorithm for approximate string\n"
+    "matching based on dynamic programming. J. ACM 46, 3 (May 1999), 395-415.\n"
+    "https://doi.org/10.1145/316542.316550\n"
+    "\n"
+);
+
+
 static PyMethodDef turboshtein_methods[] = {
     {
         "levenshtein",
         turboshtein_levenshtein,
         METH_VARARGS,
-        "Levenshtein edit distance between two ascii strings"
+        levenshtein_doc
     },
     {
         NULL,