Skip to content
This repository has been archived by the owner on Aug 26, 2024. It is now read-only.

Clarify default behaviour of extract / Add tests for matching strings #142

Merged
merged 13 commits into from
Nov 1, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
\#*
.#*
*#
dist

# Build files
build
Expand All @@ -28,3 +27,13 @@ doc/aws_hostname.1

# tox
.tox

# Hypothesis - keep the examples database
.hypothesis/tmp
.hypothesis/unicodedata

# py.test
.cache/

# Pycharm
.idea/
27 changes: 18 additions & 9 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,15 +1,24 @@
language: python
python:
- 2.6
- 2.7
- 3.3
- 3.4
- 3.5
- pypy
- pypy3
matrix:
include:
- python: "2.7"
env: TEST_SUITE=py.test
- python: "3.3"
env: TEST_SUITE=py.test
- python: "3.4"
env: TEST_SUITE=py.test
- python: "3.5"
env: TEST_SUITE=py.test
- python: "pypy"
env: TEST_SUITE=py.test
- python: "2.6"
env: TEST_SUITE="py.test test_fuzzywuzzy.py test_fuzzywuzzy_pytest.py"
- python: "pypy3"
env: TEST_SUITE="py.test test_fuzzywuzzy.py test_fuzzywuzzy_pytest.py"
install:
- pip install pytest pycodestyle
- if [ $TRAVIS_PYTHON_VERSION != 2.6 -a $TRAVIS_PYTHON_VERSION != "pypy3" ]; then pip install hypothesis; fi;
script:
- py.test
- $TEST_SUITE
notifications:
on_success: always
45 changes: 26 additions & 19 deletions fuzzywuzzy/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,15 @@
from . import fuzz
from . import utils
import heapq
import warnings

warnings.simplefilter('always')

def extractWithoutOrder(query, choices, processor=None, scorer=None, score_cutoff=0):
default_scorer = fuzz.WRatio
default_processor = utils.full_process


def extractWithoutOrder(query, choices, processor=default_processor, scorer=default_scorer, score_cutoff=0):
"""Select the best match in a list or dictionary of choices.

Find best matches in a list or dictionary of choices, return a
Expand Down Expand Up @@ -76,33 +82,34 @@ def extractWithoutOrder(query, choices, processor=None, scorer=None, score_cutof

('train', 22, 'bard'), ('man', 0, 'dog')
"""
# Catch generators without lengths
def no_process(x):
return x

if choices is None:
raise StopIteration

# Catch generators without lengths
try:
if len(choices) == 0:
if choices is None or len(choices) == 0:
raise StopIteration
except TypeError:
pass

# default: wratio
if not scorer:
scorer = fuzz.WRatio
# fuzz.WRatio already processes the string, so no extra step is needed
if not processor:
processor = no_process

# default, turn whatever the choice is into a workable string
if not processor:
processor = utils.full_process
# If the processor was removed by setting it to None,
# perform a no-op, as it still needs to be a function
if processor is None:
processor = no_process

# Run the processor on the input query.
processed_query = processor(query)

if len(processed_query) == 0:
warnings.warn("Applied processor reduces input query to empty string, all comparisons will have score 0.")

# If the scorer performs full_process with force ascii don't run full_process twice
if scorer in [fuzz.WRatio, fuzz.QRatio,
fuzz.token_set_ratio, fuzz.token_sort_ratio,
fuzz.partial_token_set_ratio, fuzz.partial_token_sort_ratio] \
and processor == utils.full_process:
processor = no_process

try:
# See if choices is a dictionary-like object.
for key, choice in choices.items():
Expand All @@ -119,7 +126,7 @@ def no_process(x):
yield (choice, score)


def extract(query, choices, processor=None, scorer=None, limit=5):
def extract(query, choices, processor=default_processor, scorer=default_scorer, limit=5):
"""Select the best match in a list or dictionary of choices.

Find best matches in a list or dictionary of choices, return a
Expand Down Expand Up @@ -169,7 +176,7 @@ def extract(query, choices, processor=None, scorer=None, limit=5):
sorted(sl, key=lambda i: i[1], reverse=True)


def extractBests(query, choices, processor=None, scorer=None, score_cutoff=0, limit=5):
def extractBests(query, choices, processor=default_processor, scorer=default_scorer, score_cutoff=0, limit=5):
"""Get a list of the best matches to a collection of choices.

Convenience function for getting the choices with best scores.
Expand All @@ -194,7 +201,7 @@ def extractBests(query, choices, processor=None, scorer=None, score_cutoff=0, li
sorted(best_list, key=lambda i: i[1], reverse=True)


def extractOne(query, choices, processor=None, scorer=None, score_cutoff=0):
def extractOne(query, choices, processor=default_processor, scorer=default_scorer, score_cutoff=0):
"""Find the single best match above a score in a list of choices.

This is a convenience method which returns the single best choice.
Expand Down
77 changes: 77 additions & 0 deletions test_fuzzywuzzy_hypothesis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
from itertools import product
from functools import partial

from hypothesis import given, assume, settings
import hypothesis.strategies as st
import pytest

from fuzzywuzzy import fuzz, process, utils


def scorers_processors():
    """
    Build the (scorer, processor) combinations exercised by the tests.

    The two plain ratio scorers are paired with every generic processor
    (identity, ``full_process`` without and with ``force_ascii``); each of
    the remaining scorers is paired with exactly one ``full_process`` variant.

    :return: [(scorer, processor), ...]
    """
    ascii_process = partial(utils.full_process, force_ascii=True)
    unicode_process = partial(utils.full_process, force_ascii=False)

    # Plain scorers are combined with every processor, scorer-major order.
    generic_processors = [lambda x: x, unicode_process, ascii_process]
    pairs = [(scorer, processor)
             for scorer in (fuzz.ratio, fuzz.partial_ratio)
             for processor in generic_processors]

    # The remaining scorers are each tested with a single processor.
    pairs += [
        (fuzz.WRatio, ascii_process),
        (fuzz.QRatio, ascii_process),
        (fuzz.UWRatio, unicode_process),
        (fuzz.UQRatio, unicode_process),
        (fuzz.token_set_ratio, ascii_process),
        (fuzz.token_sort_ratio, ascii_process),
        (fuzz.partial_token_set_ratio, ascii_process),
        (fuzz.partial_token_sort_ratio, ascii_process),
    ]
    return pairs


@pytest.mark.parametrize('scorer,processor',
                         scorers_processors())
@given(data=st.data())
@settings(max_examples=100)
def test_identical_strings_extracted(scorer, processor, data):
    """
    Check that a string always matches itself with a perfect score.

    A list of candidate strings is drawn, one of them is used as the query,
    and all matches with a score of 100 are requested; the query itself must
    be among them.

    :param scorer: scoring function handed to ``process.extractBests``
    :param processor: processor function handed to ``process.extractBests``
    :param data: hypothesis interactive data source used for the draws
    :return: None
    """
    # Random candidate strings (1 to 50 strings of 10 to 100 characters each)
    candidates = data.draw(
        st.lists(st.text(min_size=10, max_size=100),
                 min_size=1, max_size=50))

    # Random position inside the candidate list; that entry becomes the query
    last_index = len(candidates) - 1
    position = data.draw(st.integers(min_value=0, max_value=last_index))
    query = candidates[position]

    # Discard examples where the processor turns the query into ''
    assume(processor(query) != '')

    # Ask for every match that scores exactly 100, without a result limit
    perfect_matches = process.extractBests(query,
                                           candidates,
                                           scorer=scorer,
                                           processor=processor,
                                           score_cutoff=100,
                                           limit=None)

    # Something must have been returned ...
    assert perfect_matches != []

    # ... and the query itself must be one of the perfect matches
    assert (query, 100) in perfect_matches


12 changes: 12 additions & 0 deletions test_fuzzywuzzy_pytest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import warnings
from fuzzywuzzy import process


def test_process_warning():
    """Check that a query the processor reduces to an empty string warns.

    The query consists only of colons and ``extractOne`` is called with its
    default processor and scorer. The call must emit a ``UserWarning`` and
    return the matching choice with a score of 0.
    """
    query = ':::::::'
    choices = [':::::::']
    with warnings.catch_warnings(record=True) as w:
        # Record every warning regardless of the filters installed elsewhere,
        # so the test does not depend on global warning state.
        warnings.simplefilter('always')
        result = process.extractOne(query, choices)

    # Fail with a readable assertion (not an IndexError) if nothing was
    # recorded, and do not assume the warning of interest is the last one.
    assert w, "expected a warning to be emitted"
    assert any(issubclass(item.category, UserWarning) for item in w)
    assert result == (query, 0)