seatgeek · josegonzalez · Nov 1, 2016 · Oct 12, 2016 · Oct 29, 2016 · Oct 29, 2016
diff --git a/.gitignore b/.gitignore
@@ -7,7 +7,6 @@
 \#*
 .#*
 *#
-dist
 
 # Build files
 build
@@ -28,3 +27,13 @@ doc/aws_hostname.1
 
 # tox
 .tox
+
+# Hypothesis - keep the examples database
+.hypothesis/tmp
+.hypothesis/unicodedata
+
+# py.test
+.cache/
+
+# Pycharm
+.idea/
diff --git a/.travis.yml b/.travis.yml
@@ -1,15 +1,24 @@
 language: python
-python:
-  - 2.6
-  - 2.7
-  - 3.3
-  - 3.4
-  - 3.5
-  - pypy
-  - pypy3
+matrix:
+  include:
+  - python: "2.7"
+    env: TEST_SUITE=py.test
+  - python: "3.3"
+    env: TEST_SUITE=py.test
+  - python: "3.4"
+    env: TEST_SUITE=py.test
+  - python: "3.5"
+    env: TEST_SUITE=py.test
+  - python: "pypy"
+    env: TEST_SUITE=py.test
+  - python: "2.6"
+    env: TEST_SUITE="py.test test_fuzzywuzzy.py test_fuzzywuzzy_pytest.py"
+  - python: "pypy3"
+    env: TEST_SUITE="py.test test_fuzzywuzzy.py test_fuzzywuzzy_pytest.py"
 install:
   - pip install pytest pycodestyle
+  - if [ $TRAVIS_PYTHON_VERSION != 2.6 -a $TRAVIS_PYTHON_VERSION != "pypy3" ]; then pip install hypothesis; fi;
 script:
-  - py.test
+  - $TEST_SUITE
 notifications:
   on_success: always
diff --git a/fuzzywuzzy/process.py b/fuzzywuzzy/process.py
@@ -28,9 +28,15 @@
 from . import fuzz
 from . import utils
 import heapq
+import warnings
 
+warnings.filterwarnings('always', module=__name__)  # scoped: leave the app's global filters alone
 
-def extractWithoutOrder(query, choices, processor=None, scorer=None, score_cutoff=0):
+default_scorer = fuzz.WRatio
+default_processor = utils.full_process
+
+
+def extractWithoutOrder(query, choices, processor=default_processor, scorer=default_scorer, score_cutoff=0):
     """Select the best match in a list or dictionary of choices.
 
     Find best matches in a list or dictionary of choices, return a
@@ -76,33 +82,34 @@ def extractWithoutOrder(query, choices, processor=None, scorer=None, score_cutof
 
         ('train', 22, 'bard'), ('man', 0, 'dog')
     """
+    # Catch generators without lengths
     def no_process(x):
         return x
 
-    if choices is None:
-        raise StopIteration
-
-    # Catch generators without lengths
     try:
-        if len(choices) == 0:
+        if choices is None or len(choices) == 0:
             raise StopIteration
     except TypeError:
         pass
 
-    # default: wratio
-    if not scorer:
-        scorer = fuzz.WRatio
-        # fuzz.WRatio already process string so no need extra step
-        if not processor:
-            processor = no_process
-
-    # default, turn whatever the choice is into a workable string
-    if not processor:
-        processor = utils.full_process
+    # If the processor was removed by setting it to None,
+    # perform a no-op, as it still needs to be a function
+    if processor is None:
+        processor = no_process
 
     # Run the processor on the input query.
     processed_query = processor(query)
 
+    if len(processed_query) == 0:
+        warnings.warn("Applied processor reduces input query to empty string, all comparisons will have score 0.")
+
+    # If the scorer already runs full_process with force_ascii, don't run full_process twice
+    if scorer in [fuzz.WRatio, fuzz.QRatio,
+                  fuzz.token_set_ratio, fuzz.token_sort_ratio,
+                  fuzz.partial_token_set_ratio, fuzz.partial_token_sort_ratio] \
+            and processor == utils.full_process:
+        processor = no_process
+
     try:
         # See if choices is a dictionary-like object.
         for key, choice in choices.items():
@@ -119,7 +126,7 @@ def no_process(x):
                 yield (choice, score)
 
 
-def extract(query, choices, processor=None, scorer=None, limit=5):
+def extract(query, choices, processor=default_processor, scorer=default_scorer, limit=5):
     """Select the best match in a list or dictionary of choices.
 
     Find best matches in a list or dictionary of choices, return a
@@ -169,7 +176,7 @@ def extract(query, choices, processor=None, scorer=None, limit=5):
         sorted(sl, key=lambda i: i[1], reverse=True)
 
 
-def extractBests(query, choices, processor=None, scorer=None, score_cutoff=0, limit=5):
+def extractBests(query, choices, processor=default_processor, scorer=default_scorer, score_cutoff=0, limit=5):
     """Get a list of the best matches to a collection of choices.
 
     Convenience function for getting the choices with best scores.
@@ -194,7 +201,7 @@ def extractBests(query, choices, processor=None, scorer=None, score_cutoff=0, li
         sorted(best_list, key=lambda i: i[1], reverse=True)
 
 
-def extractOne(query, choices, processor=None, scorer=None, score_cutoff=0):
+def extractOne(query, choices, processor=default_processor, scorer=default_scorer, score_cutoff=0):
     """Find the single best match above a score in a list of choices.
 
     This is a convenience method which returns the single best choice.

diff --git a/test_fuzzywuzzy_hypothesis.py b/test_fuzzywuzzy_hypothesis.py
@@ -0,0 +1,77 @@
+from itertools import product
+from functools import partial
+
+from hypothesis import given, assume, settings
+import hypothesis.strategies as st
+import pytest
+
+from fuzzywuzzy import fuzz, process, utils
+
+
+def scorers_processors():
+    """
+    Generate a list of (scorer, processor) pairs for testing.
+
+    :return: [(scorer, processor), ...]
+    """
+    scorers = [fuzz.ratio,
+               fuzz.partial_ratio]
+    processors = [lambda x: x,  # identity: pass the string through unchanged
+                  partial(utils.full_process, force_ascii=False),
+                  partial(utils.full_process, force_ascii=True)]
+    splist = list(product(scorers, processors))  # every scorer x processor combination
+    splist.extend(  # remaining scorers, each paired with a single force_ascii setting
+        [(fuzz.WRatio, partial(utils.full_process, force_ascii=True)),
+         (fuzz.QRatio, partial(utils.full_process, force_ascii=True)),
+         (fuzz.UWRatio, partial(utils.full_process, force_ascii=False)),
+         (fuzz.UQRatio, partial(utils.full_process, force_ascii=False)),
+         (fuzz.token_set_ratio, partial(utils.full_process, force_ascii=True)),
+         (fuzz.token_sort_ratio, partial(utils.full_process, force_ascii=True)),
+         (fuzz.partial_token_set_ratio, partial(utils.full_process, force_ascii=True)),
+         (fuzz.partial_token_sort_ratio, partial(utils.full_process, force_ascii=True))]
+    )
+
+    return splist
+
+
+@pytest.mark.parametrize('scorer,processor',
+                         scorers_processors())
+@given(data=st.data())
+@settings(max_examples=100)
+def test_identical_strings_extracted(scorer, processor, data):
+    """
+    Test that identical strings will always return a perfect match.
+
+    :param scorer: scoring function handed to process.extractBests
+    :param processor: string processor handed to process.extractBests
+    :param data: hypothesis data object used to draw the examples
+    :return: None; failures are reported through the assertions
+    """
+    # Draw a list of random strings
+    strings = data.draw(
+        st.lists(st.text(min_size=10, max_size=100),
+                 min_size=1, max_size=50))
+    # Draw a random integer for the index in that list
+    choiceidx = data.draw(st.integers(min_value=0, max_value=(len(strings) - 1)))
+
+    # Extract our choice from the list
+    choice = strings[choiceidx]
+
+    # Discard examples where the processor reduces our choice to the empty string
+    assume(processor(choice) != '')
+
+    # Extract all perfect matches
+    result = process.extractBests(choice,
+                                  strings,
+                                  scorer=scorer,
+                                  processor=processor,
+                                  score_cutoff=100,
+                                  limit=None)
+
+    # Check we get a result
+    assert result != []
+
+    # Check the original is in the list
+    assert (choice, 100) in result
+
+
diff --git a/test_fuzzywuzzy_pytest.py b/test_fuzzywuzzy_pytest.py
@@ -0,0 +1,17 @@
+import warnings
+from fuzzywuzzy import process
+
+
+def test_process_warning():
+    """Check that a query the processor reduces to an empty string raises a warning"""
+    query = ':::::::'
+    choices = [':::::::']
+    with warnings.catch_warnings(record=True) as w:
+        # Record every warning regardless of the filters already in effect,
+        # so the test does not depend on global warning state.
+        warnings.simplefilter('always')
+        result = process.extractOne(query, choices)
+    # Fail with a clear message (not an IndexError) if nothing was recorded.
+    assert w, "expected a warning for an empty processed query"
+    assert issubclass(w[-1].category, UserWarning)
+    assert result == (query, 0)