🐛 Regression on some detection cases showcased in the documentation (#371) (#378)

Also added a noise (md) probe that identifies malformed Arabic representations due to the presence of letters in isolated form (credit to my wife, thanks!)
Ousret authored Oct 31, 2023
1 parent a4b9b01 commit 79dce48
Showing 6 changed files with 69 additions and 2 deletions.
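For context on what the new probe looks for: Arabic letters take different shapes depending on their position in a word, and Unicode reserves dedicated "presentation form" code points for those shapes. The isolated-form variants rarely appear in correctly decoded modern text, so a payload dominated by them is a strong hint that the wrong codepage was used. A minimal, self-contained illustration of that signal (the sample code points are hypothetical examples, not taken from this commit):

```python
import unicodedata

# U+FE8D is an Arabic presentation form; its Unicode name ends with
# "ISOLATED FORM", which is exactly the marker the new probe keys on.
print(unicodedata.name("\uFE8D"))  # ARABIC LETTER ALEF ISOLATED FORM
print(unicodedata.name("\u0627"))  # ARABIC LETTER ALEF (the regular code point)

print("ISOLATED FORM" in unicodedata.name("\uFE8D"))  # True
```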
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -6,7 +6,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

### Fixed
- Unintentional memory usage regression when using a large payload that matches several encodings (#376)
- Regression on some detection cases showcased in the documentation (#371)

### Added
- Noise (md) probe that identifies malformed Arabic representations due to the presence of letters in isolated form (credit to my wife)

## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)

33 changes: 32 additions & 1 deletion charset_normalizer/md.py
@@ -9,6 +9,8 @@
)
from .utils import (
is_accentuated,
is_arabic,
is_arabic_isolated_form,
is_case_variable,
is_cjk,
is_emoticon,
@@ -127,8 +129,9 @@ def reset(self) -> None: # pragma: no cover

@property
def ratio(self) -> float:
-        if self._character_count == 0 or self._character_count < 8:
+        if self._character_count < 8:
return 0.0

ratio_of_accentuation: float = self._accentuated_count / self._character_count
return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0

@@ -455,6 +458,34 @@ def ratio(self) -> float:
return self._successive_upper_lower_count_final / self._character_count


class ArabicIsolatedFormPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._character_count: int = 0
self._isolated_form_count: int = 0

def reset(self) -> None: # pragma: no cover
self._character_count = 0
self._isolated_form_count = 0

def eligible(self, character: str) -> bool:
return is_arabic(character)

def feed(self, character: str) -> None:
self._character_count += 1

if is_arabic_isolated_form(character):
self._isolated_form_count += 1

@property
def ratio(self) -> float:
if self._character_count < 8:
return 0.0

isolated_form_usage: float = self._isolated_form_count / self._character_count

return isolated_form_usage


@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
unicode_range_a: Optional[str], unicode_range_b: Optional[str]
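A short usage sketch of the plugin added above, assuming a charset_normalizer build that contains this commit. The sample string is made up for illustration and uses eight isolated-form letters, so it clears the plugin's minimum of eight fed characters:

```python
from charset_normalizer.md import ArabicIsolatedFormPlugin

plugin = ArabicIsolatedFormPlugin()

# Eight Arabic presentation-form (isolated) letters:
# alef-hamza, beh, teh, theh, jeem, hah, khah, dal.
suspicious = "\uFE83\uFE8F\uFE95\uFE99\uFE9D\uFEA1\uFEA5\uFEA9"

for character in suspicious:
    if plugin.eligible(character):  # true for any character whose name contains "ARABIC"
        plugin.feed(character)

print(plugin.ratio)  # 1.0 -> every eligible character is an isolated form
```

In the detector itself this ratio is combined with the other plugins' ratios into the overall mess score; it is not used on its own.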
22 changes: 22 additions & 0 deletions charset_normalizer/utils.py
@@ -32,6 +32,8 @@ def is_accentuated(character: str) -> bool:
or "WITH DIAERESIS" in description
or "WITH CIRCUMFLEX" in description
or "WITH TILDE" in description
or "WITH MACRON" in description
or "WITH RING ABOVE" in description
)


@@ -174,6 +176,26 @@ def is_thai(character: str) -> bool:
return "THAI" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False

return "ARABIC" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False

return "ARABIC" in character_name and "ISOLATED FORM" in character_name


@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
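The two clauses added to is_accentuated mean that letters such as "ā" (LATIN SMALL LETTER A WITH MACRON) and "å" (LATIN SMALL LETTER A WITH RING ABOVE) now count as accentuated. A quick check of the touched helpers, assuming the charset_normalizer.utils import path shown in this diff:

```python
from charset_normalizer.utils import (
    is_accentuated,
    is_arabic,
    is_arabic_isolated_form,
)

print(is_accentuated("\u0101"))           # True  - LATIN SMALL LETTER A WITH MACRON
print(is_accentuated("\u00E5"))           # True  - LATIN SMALL LETTER A WITH RING ABOVE
print(is_arabic("\u0627"))                # True  - ARABIC LETTER ALEF
print(is_arabic_isolated_form("\u0627"))  # False - a regular letter, not a presentation form
print(is_arabic_isolated_form("\uFE8D"))  # True  - ARABIC LETTER ALEF ISOLATED FORM
```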
8 changes: 8 additions & 0 deletions tests/test_base_detection.py
@@ -115,3 +115,11 @@ def test_alphabets_property():
assert "Basic Latin" in best_guess.alphabets
assert "Emoticons range(Emoji)" in best_guess.alphabets
assert best_guess.alphabets.count("Basic Latin") == 1


def test_doc_example_short_cp1251():
best_guess = from_bytes(
'Bсеки човек има право на образование.'.encode('cp1251')
).best()

assert best_guess.encoding == "cp1251"
3 changes: 3 additions & 0 deletions tests/test_large_payload.py
@@ -12,6 +12,7 @@ def test_large_payload_u8_sig_basic_entry():
assert best_guess.encoding == "utf_8", "Large U8 payload case detection wrongly detected!"
assert best_guess.bom is True, "SIG/BOM property should be True"
assert len(best_guess.raw) == len(payload), "Large payload should remain untouched when accessed through .raw"
assert best_guess._string is not None, "str should be decoded before direct access (sig available)"


def test_large_payload_ascii_basic_entry():
@@ -22,6 +23,7 @@ def test_large_payload_ascii_basic_entry():
assert best_guess.encoding == "ascii", "Large ASCII payload case detection wrongly detected!"
assert best_guess.bom is False, "SIG/BOM property should be False"
assert len(best_guess.raw) == len(payload), "Large payload should remain untouched when accessed through .raw"
assert best_guess._string is None, "str should not be decoded until direct access"


def test_misleading_large_sequence():
@@ -32,5 +34,6 @@ def test_misleading_large_sequence():
assert len(guesses) > 0
match = guesses.best()
assert match is not None
assert match._string is not None, "str should be cached as only match"
assert match.encoding == 'utf_8'
assert str(match) is not None
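The new assertions inspect the private `_string` cache to pin down when decoding happens: the lone surviving UTF-8 match is decoded eagerly, while the large ASCII payload stays undecoded until the text is actually requested. From the public API, that request is simply `str(match)`, as the last assertion shows. A minimal sketch of the public-facing behaviour, with a made-up payload:

```python
from charset_normalizer import from_bytes

# Hypothetical large, purely ASCII payload; the real tests use fixture files.
payload = b"This is a rather large, perfectly plain ASCII payload. " * 20_000

match = from_bytes(payload).best()
assert match is not None
assert match.encoding == "ascii"

text = str(match)  # the decoded string is materialized (or served from cache) here
assert len(text) > 0
```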
2 changes: 1 addition & 1 deletion tests/test_mess_detection.py
@@ -12,7 +12,7 @@
("´Á¥½³ø§i -- ±i®Ìºû, ³¯·Ø©v", 0.5, 1.),
("ïstanbul, T■rkiye'nin en kalabal»k, iktisadi ve k■lt■rel aÓ»dan en —nemli", 0.1, 0.5),
("<i>Parce que Óa, c'est la vÕritable histoire de la rencontre avec votre Tante Robin.</i>", 0.01, 0.5),
("""ØĢØŠØģاØĶŲ„ Ų„Ųˆ ØĢŲ† اŲ„Ų†Ø§Øģ ŲŠŲˆŲ… Ų…ا ØģŲˆŲŲŠØŠØģاØĶŲ„ŲˆŲ†ØŒ ØŊØđŲ†Ø§ Ų†ØģŲ…Øđ ØđŲ† (ŲØąŲˆØŊŲˆ) ŲˆØ§Ų„ØŪا؊Ų…""", 0.8, 2.0),
("""ØĢØŠØģاØĶŲ„ Ų„Ųˆ ØĢŲ† اŲ„Ų†Ø§Øģ ŲŠŲˆŲ… Ų…ا ØģŲˆŲŲŠØŠØģاØĶŲ„ŲˆŲ†ØŒ ØŊØđŲ†Ø§ Ų†ØģŲ…Øđ ØđŲ† (ŲØąŲˆØŊŲˆ) ŲˆØ§Ų„ØŪا؊Ų…""", 0.8, 3.0),
("""ÇáÚŞáíÉ , ÇáÊäæíã ÇáãÛäÇØíÓí æ / Ãæ ÇáÇŞÊÑÇÍ""", 0.8, 2.5),
("""hishamkoc@yahoo.com ุชุฑุฌู…ู€ู€ุฉ ู‡ู€ุดู€ู€ู€ุงู… ุงู„ู€ู‚ู€ู€ู€ู€ู„ุงูRadoZ ุชู€ู€ู€ุนู€ู€ู€ู€ุฏูŠู€ู€ู„ ุงู„ู€ู€ู€ุชู€ู€ู€ู€ูˆู‚ู€ู€ูŠู€ู€ู€ู€ุช ู…ู€ู€ู€ู† ู‚ู€ู€ุจู€ู€ู„""", 0.5, 2.0)
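The only change in this table is the expected upper bound for the garbled Arabic sample, raised from 2.0 to 3.0, likely because the widened is_accentuated check now matches letters present in the sample such as "Ū" (U WITH MACRON), so its mess score rises and the tolerance follows. A hedged sketch of running the same kind of check by hand, assuming the mess_ratio helper these tests exercise is importable from charset_normalizer.md (the sample string below is a stand-in, not the one from the test):

```python
from charset_normalizer.md import mess_ratio

# A stand-in for a mis-decoded Arabic payload: nothing but
# presentation-form (isolated) letters.
decoded = "\uFE83\uFE8F\uFE95\uFE99\uFE9D\uFEA1\uFEA5\uFEA9"

score = mess_ratio(decoded)
print(score)  # expected to sit well above zero with the new probe in place
```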
