-
Notifications
You must be signed in to change notification settings - Fork 5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Added ignore_script
and tested it.
#17
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -674,14 +674,16 @@ def match_score(self, supported: 'Language') -> int: | |||||
) | ||||||
return 100 - min(self.distance(supported), 100) | ||||||
|
||||||
def distance(self, supported: 'Language') -> int: | ||||||
def distance(self, supported: 'Language', ignore_script: bool = False) -> int: | ||||||
""" | ||||||
Suppose that `self` is the language that the user desires, and | ||||||
`supported` is a language that is actually supported. | ||||||
|
||||||
This method returns a number from 0 to 134 measuring the 'distance' | ||||||
between the languages (lower numbers are better). This is not a | ||||||
symmetric relation. | ||||||
symmetric relation. If `ignore_script` is `True`, the script will | ||||||
not be used in the comparison, possibly resulting in a smaller | ||||||
'distance'. | ||||||
|
||||||
The language distance is not really about the linguistic similarity or | ||||||
history of the languages; instead, it's based largely on sociopolitical | ||||||
|
@@ -703,25 +705,39 @@ def distance(self, supported: 'Language') -> int: | |||||
desired_triple = ('und', 'Zzzz', 'ZZ') | ||||||
else: | ||||||
desired_complete = self.prefer_macrolanguage().maximize() | ||||||
desired_triple = ( | ||||||
desired_complete.language, | ||||||
desired_complete.script, | ||||||
desired_complete.territory, | ||||||
) | ||||||
|
||||||
if ignore_script: | ||||||
desired_triple = ( | ||||||
desired_complete.language, | ||||||
None, | ||||||
desired_complete.territory, | ||||||
) | ||||||
else: | ||||||
desired_triple = ( | ||||||
desired_complete.language, | ||||||
desired_complete.script, | ||||||
desired_complete.territory, | ||||||
) | ||||||
|
||||||
if ( | ||||||
supported.language is None | ||||||
and supported.script is None | ||||||
and supported.territory is None | ||||||
): | ||||||
supported_triple = ('und', 'Zzzz', 'ZZ') | ||||||
supported_triple = ('und', 'Zzzz', 'ZZ') | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think that's a mistake; can you remove the added whitespace? |
||||||
else: | ||||||
supported_complete = supported.prefer_macrolanguage().maximize() | ||||||
supported_triple = ( | ||||||
supported_complete.language, | ||||||
supported_complete.script, | ||||||
supported_complete.territory, | ||||||
) | ||||||
if ignore_script: | ||||||
supported_triple = ( | ||||||
supported_complete.language, | ||||||
None, | ||||||
supported_complete.territory, | ||||||
) | ||||||
else: | ||||||
supported_triple = ( | ||||||
supported_complete.language, | ||||||
supported_complete.script, | ||||||
supported_complete.territory, | ||||||
) | ||||||
Comment on lines
+729
to
+740
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This can be simplified as shown above. |
||||||
|
||||||
return tuple_distance_cached(desired_triple, supported_triple) | ||||||
|
||||||
|
@@ -1648,7 +1664,7 @@ def tag_match_score( | |||||
return desired_ld.match_score(supported_ld) | ||||||
|
||||||
|
||||||
def tag_distance(desired: Union[str, Language], supported: Union[str, Language]) -> int: | ||||||
def tag_distance(desired: Union[str, Language], supported: Union[str, Language], ignore_script: bool = False) -> int: | ||||||
""" | ||||||
Tags that expand to the same thing when likely values are filled in get a | ||||||
distance of 0. | ||||||
|
@@ -1791,14 +1807,20 @@ def tag_distance(desired: Union[str, Language], supported: Union[str, Language]) | |||||
|
||||||
>>> tag_distance('ja', 'ja-Latn-US-hepburn') | ||||||
54 | ||||||
|
||||||
If `ignore_script` is used, the script difference is ignored and a smaller | ||||||
difference with a lower score will be found. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Typo:
Suggested change
|
||||||
|
||||||
>>> tag_distance('ja', 'ja-Latn-hepburn', ignore_script=True) | ||||||
0 | ||||||
|
||||||
>>> # You can read the Shavian script, right? | ||||||
>>> tag_distance('en', 'en-Shaw') | ||||||
54 | ||||||
""" | ||||||
desired_obj = Language.get(desired) | ||||||
supported_obj = Language.get(supported) | ||||||
return desired_obj.distance(supported_obj) | ||||||
return desired_obj.distance(supported_obj, ignore_script) | ||||||
|
||||||
|
||||||
def best_match( | ||||||
|
@@ -1835,6 +1857,7 @@ def closest_match( | |||||
desired_language: Union[str, Language], | ||||||
supported_languages: Sequence[str], | ||||||
max_distance: int = 25, | ||||||
ignore_script: bool = False, | ||||||
) -> Tuple[str, int]: | ||||||
""" | ||||||
You have software that supports any of the `supported_languages`. You want | ||||||
|
@@ -1853,6 +1876,9 @@ def closest_match( | |||||
value is 25, and raising it can cause data to be processed in significantly | ||||||
the wrong language. The documentation for `tag_distance` describes the | ||||||
distance values in more detail. | ||||||
|
||||||
`ignore_script` makes the matching ignore scripts, allowing matches to be | ||||||
found when they wouldn't otherwise be due to different scripts. | ||||||
|
||||||
When there is a tie for the best matching language, the first one in the | ||||||
tie will be used. | ||||||
|
@@ -1871,6 +1897,9 @@ def closest_match( | |||||
|
||||||
>>> closest_match('ja', ['ja-Latn-hepburn', 'en']) | ||||||
('und', 1000) | ||||||
|
||||||
>>> closest_match('ja', ['ja-Latn-hepburn', 'en'], ignore_script=True) | ||||||
('ja-Latn-hepburn', 0) | ||||||
""" | ||||||
desired_language = str(desired_language) | ||||||
|
||||||
|
@@ -1884,7 +1913,7 @@ def closest_match( | |||||
return desired_language, 0 | ||||||
|
||||||
match_distances = [ | ||||||
(supported, tag_distance(desired_language, supported)) | ||||||
(supported, tag_distance(desired_language, supported, ignore_script)) | ||||||
for supported in supported_languages | ||||||
] | ||||||
match_distances = [ | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
from langcodes import closest_match | ||
|
||
|
||
def test_language_less_than(): | ||
spoken_language_1 = 'pa' | ||
spoken_language_2 = 'pa-PK' | ||
match = closest_match( | ||
spoken_language_1, [spoken_language_2], ignore_script=True | ||
) | ||
print(match) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think print output in tests is hard to manage at execution time; do we need the debug output? If so, I'd prefer using logging. |
||
assert match[0] != "und" | ||
|
||
|
||
def test_language_more_than(): | ||
spoken_language_1 = 'pa-PK' | ||
spoken_language_2 = 'pa' | ||
match = closest_match( | ||
spoken_language_1, [spoken_language_2], ignore_script=True | ||
) | ||
print(match) | ||
assert match[0] != "und" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd prefer the following: