Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added ignore_script and tested it. #17

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 46 additions & 17 deletions langcodes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,14 +674,16 @@ def match_score(self, supported: 'Language') -> int:
)
return 100 - min(self.distance(supported), 100)

def distance(self, supported: 'Language') -> int:
def distance(self, supported: 'Language', ignore_script: bool = False) -> int:
"""
Suppose that `self` is the language that the user desires, and
`supported` is a language that is actually supported.

This method returns a number from 0 to 134 measuring the 'distance'
between the languages (lower numbers are better). This is not a
symmetric relation.
symmetric relation. If `ignore_script` is `True`, the script will
not be used in the comparison, possibly resulting in a smaller
'distance'.

The language distance is not really about the linguistic similarity or
history of the languages; instead, it's based largely on sociopolitical
Expand All @@ -703,25 +705,39 @@ def distance(self, supported: 'Language') -> int:
desired_triple = ('und', 'Zzzz', 'ZZ')
else:
desired_complete = self.prefer_macrolanguage().maximize()
desired_triple = (
desired_complete.language,
desired_complete.script,
desired_complete.territory,
)

if ignore_script:
desired_triple = (
desired_complete.language,
None,
desired_complete.territory,
)
else:
desired_triple = (
desired_complete.language,
desired_complete.script,
desired_complete.territory,
)
Comment on lines +708 to +719
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd prefer the following:

Suggested change
if ignore_script:
desired_triple = (
desired_complete.language,
None,
desired_complete.territory,
)
else:
desired_triple = (
desired_complete.language,
desired_complete.script,
desired_complete.territory,
)
desired_triple = (
desired_complete.language,
None if ignore_script else desired_complete.script,
desired_complete.territory,
)


if (
supported.language is None
and supported.script is None
and supported.territory is None
):
supported_triple = ('und', 'Zzzz', 'ZZ')
supported_triple = ('und', 'Zzzz', 'ZZ')
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that's a mistake; can you remove the added whitespace?

else:
supported_complete = supported.prefer_macrolanguage().maximize()
supported_triple = (
supported_complete.language,
supported_complete.script,
supported_complete.territory,
)
if ignore_script:
supported_triple = (
supported_complete.language,
None,
supported_complete.territory,
)
else:
supported_triple = (
supported_complete.language,
supported_complete.script,
supported_complete.territory,
)
Comment on lines +729 to +740
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can be simplified as shown above.


return tuple_distance_cached(desired_triple, supported_triple)

Expand Down Expand Up @@ -1648,7 +1664,7 @@ def tag_match_score(
return desired_ld.match_score(supported_ld)


def tag_distance(desired: Union[str, Language], supported: Union[str, Language]) -> int:
def tag_distance(desired: Union[str, Language], supported: Union[str, Language], ignore_script: bool = False) -> int:
"""
Tags that expand to the same thing when likely values are filled in get a
distance of 0.
Expand Down Expand Up @@ -1791,14 +1807,20 @@ def tag_distance(desired: Union[str, Language], supported: Union[str, Language])

>>> tag_distance('ja', 'ja-Latn-US-hepburn')
54

If `ignore_script` is used, the script difference is ignored and a smaller
difference with a lower score will be found.
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo:

Suggested change
differenge with lower score will be found.
difference with lower score will be found.


>>> tag_distance('ja', 'ja-Latn-hepburn', ignore_script=True)
0

>>> # You can read the Shavian script, right?
>>> tag_distance('en', 'en-Shaw')
54
"""
desired_obj = Language.get(desired)
supported_obj = Language.get(supported)
return desired_obj.distance(supported_obj)
return desired_obj.distance(supported_obj, ignore_script)


def best_match(
Expand Down Expand Up @@ -1835,6 +1857,7 @@ def closest_match(
desired_language: Union[str, Language],
supported_languages: Sequence[str],
max_distance: int = 25,
ignore_script: bool = False,
) -> Tuple[str, int]:
"""
You have software that supports any of the `supported_languages`. You want
Expand All @@ -1853,6 +1876,9 @@ def closest_match(
value is 25, and raising it can cause data to be processed in significantly
the wrong language. The documentation for `tag_distance` describes the
distance values in more detail.

`ignore_script` makes the matching ignore scripts, allowing matches to be
found when they wouldn't otherwise be due to different scripts.

When there is a tie for the best matching language, the first one in the
tie will be used.
Expand All @@ -1871,6 +1897,9 @@ def closest_match(

>>> closest_match('ja', ['ja-Latn-hepburn', 'en'])
('und', 1000)

>>> closest_match('ja', ['ja-Latn-hepburn', 'en'], ignore_script=True)
('ja-Latn-hepburn', 0)
"""
desired_language = str(desired_language)

Expand All @@ -1884,7 +1913,7 @@ def closest_match(
return desired_language, 0

match_distances = [
(supported, tag_distance(desired_language, supported))
(supported, tag_distance(desired_language, supported, ignore_script))
for supported in supported_languages
]
match_distances = [
Expand Down
21 changes: 21 additions & 0 deletions langcodes/tests/test_issue_59.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from langcodes import closest_match


def test_language_less_than():
spoken_language_1 = 'pa'
spoken_language_2 = 'pa-PK'
match = closest_match(
spoken_language_1, [spoken_language_2], ignore_script=True
)
print(match)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think `print` output in tests is hard to manage at execution time. Do we need the debug output? If so, I'd prefer using logging.

assert match[0] != "und"


def test_language_more_than():
    """Check that a territory-qualified tag still matches its bare language.

    The desired tag ('pa-PK') carries more information than the supported
    tag ('pa'). With ``ignore_script=True`` the match must not fall back to
    the "no match" result.
    """
    desired = 'pa-PK'
    supported = 'pa'
    match = closest_match(desired, [supported], ignore_script=True)
    # NOTE(review): presumably 'pa-PK' and 'pa' maximize to different scripts,
    # which is why ignore_script is needed here -- confirm against CLDR data.
    # closest_match reports a failed match with the 'und' tag; put the result
    # in the failure message instead of printing it, since pytest captures
    # stdout on passing runs anyway.
    assert match[0] != "und", f"expected a match, got {match!r}"
Loading