Skip to content

Commit

Permalink
Modify find_similar_str()
Browse files Browse the repository at this point in the history
  • Loading branch information
mikeqfu committed Jun 17, 2023
1 parent aeebaa5 commit b99678b
Showing 1 changed file with 21 additions and 20 deletions.
41 changes: 21 additions & 20 deletions pyhelpers/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -587,10 +587,10 @@ def _find_str_by_difflib(x, lookup_list, n=1, ignore_punctuation=True, **kwargs)
return sim_str


def _find_str_by_thefuzz(x, lookup_list, n=1, **kwargs):
def _find_str_by_rapidfuzz(x, lookup_list, n=1, **kwargs):
"""
Find ``n`` strings that are similar to ``x`` from among a sequence of candidates
by using `TheFuzz <https://pypi.org/project/thefuzz/>`_.
by using `RapidFuzz <https://pypi.org/project/rapidfuzz/>`_.
:param x: a string-type variable
:type x: str
Expand All @@ -600,15 +600,15 @@ def _find_str_by_thefuzz(x, lookup_list, n=1, **kwargs):
when ``n=None``, the function returns a sorted ``lookup_list``
(in the descending order of similarity)
:type n: int | None
:param kwargs: [optional] parameters of `thefuzz.fuzz.token_set_ratio`_
:param kwargs: [optional] parameters of `rapidfuzz.fuzz.QRatio`_
:return: a string-type variable that should be similar to (or the same as) ``x``
:rtype: str | list | None
.. _`thefuzz.fuzz.token_set_ratio`: https://github.com/seatgeek/thefuzz#token-set-ratio
.. _`rapidfuzz.fuzz.QRatio`: https://github.com/maxbachmann/RapidFuzz#quick-ratio
**Tests**::
>>> from pyhelpers.text import _find_str_by_thefuzz
>>> from pyhelpers.text import _find_str_by_rapidfuzz
>>> lookup_lst = ['Anglia',
... 'East Coast',
Expand All @@ -621,20 +621,21 @@ def _find_str_by_thefuzz(x, lookup_list, n=1, **kwargs):
... 'Wessex',
... 'Western']
>>> y = _find_str_by_thefuzz(x='angle', lookup_list=lookup_lst, n=1)
>>> y = _find_str_by_rapidfuzz(x='angle', lookup_list=lookup_lst, n=1)
>>> y
'Anglia'
>>> y = _find_str_by_thefuzz(x='123', lookup_list=lookup_lst, n=1)
>>> y = _find_str_by_rapidfuzz(x='123', lookup_list=lookup_lst, n=1)
>>> y is None
True
"""

thefuzz_fuzz = _check_dependency(name='thefuzz.fuzz')
rapidfuzz_fuzz, rapidfuzz_utils = map(_check_dependency, ['rapidfuzz.fuzz', 'rapidfuzz.utils'])

lookup_list_ = list(lookup_list)

l_distances = [thefuzz_fuzz.token_set_ratio(s1=x, s2=a, **kwargs) for a in lookup_list_]
kwargs.update({'processor': rapidfuzz_utils.default_process})
l_distances = [rapidfuzz_fuzz.QRatio(s1=x, s2=a, **kwargs) for a in lookup_list_]

if sum(l_distances) == 0:
sim_str = None
Expand Down Expand Up @@ -663,29 +664,29 @@ def find_similar_str(x, lookup_list, n=1, ignore_punctuation=True, engine='diffl
defaults to ``True``
:type ignore_punctuation: bool
:param engine: options include ``'difflib'`` (default) and
``'thefuzz'`` (previously ``'fuzzywuzzy'``) (or simply ``'fuzz'``)
``'rapidfuzz'`` (or simply ``'fuzz'``)
- if ``engine='difflib'``, the function relies on `difflib.get_close_matches`_
- if ``engine='thefuzz'`` (or ``engine='fuzz'``), the function relies on
`thefuzz.fuzz.token_set_ratio`_
- if ``engine='rapidfuzz'`` (or ``engine='fuzz'``), the function relies on
`rapidfuzz.fuzz.QRatio`_
:type engine: str | typing.Callable
:param kwargs: [optional] parameters of `difflib.get_close_matches`_ (e.g. ``cutoff=0.6``) or
`thefuzz.fuzz.token_set_ratio`_, depending on ``engine``
`rapidfuzz.fuzz.QRatio`_, depending on ``engine``
:return: a string-type variable that should be similar to (or the same as) ``x``
:rtype: str | list | None
.. _`difflib.get_close_matches`:
https://docs.python.org/3/library/difflib.html#difflib.get_close_matches
.. _`thefuzz.fuzz.token_set_ratio`:
https://github.com/seatgeek/thefuzz#token-set-ratio
.. _`rapidfuzz.fuzz.QRatio`:
https://github.com/maxbachmann/RapidFuzz#quick-ratio
.. note::
- By default, the function uses the built-in module
`difflib <https://docs.python.org/3/library/difflib.html>`_; when we set the parameter
``engine='thefuzz'`` (or ``engine='fuzz'``), the function then relies on
`TheFuzz <https://pypi.org/project/thefuzz/>`_, which is not an essential dependency
``engine='rapidfuzz'`` (or ``engine='fuzz'``), the function then relies on
`RapidFuzz <https://pypi.org/project/rapidfuzz/>`_, which is not an essential dependency
for installing pyhelpers. It can, however, be installed separately beforehand by using
``pip`` (or ``conda``).
Expand Down Expand Up @@ -736,15 +737,15 @@ def find_similar_str(x, lookup_list, n=1, ignore_punctuation=True, engine='diffl
['Wessex', 'Western']
"""

methods = {'difflib', 'fuzzywuzzy', 'thefuzz', 'fuzz', None}
methods = {'difflib', 'fuzzywuzzy', 'rapidfuzz', 'fuzz', None}
assert engine in methods or callable(engine), \
f"Invalid input: `engine`. Valid options can include {methods}."

if engine in {'difflib', None}:
sim_str = _find_str_by_difflib(x, lookup_list, n, ignore_punctuation, **kwargs)

elif engine in {'fuzzywuzzy', 'thefuzz', 'fuzz'}:
sim_str = _find_str_by_thefuzz(x, lookup_list, n, **kwargs)
elif engine in {'rapidfuzz', 'fuzz'}:
sim_str = _find_str_by_rapidfuzz(x, lookup_list, n, **kwargs)

else:
sim_str = engine(x, lookup_list, **kwargs)
Expand Down

0 comments on commit b99678b

Please sign in to comment.