🐛 Regression on some detection cases showcased in the documentation (#371) (#378)

Also added a noise (md) probe that identifies malformed Arabic representations due to the presence of letters in isolated form (credit to my wife, thanks!)
Ousret authored Oct 31, 2023
1 parent a4b9b01 commit 79dce48
Showing 6 changed files with 69 additions and 2 deletions.
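For context on what the new probe looks for: Arabic letters take different shapes depending on their position in a word, and Unicode reserves dedicated "presentation form" code points for those shapes. The isolated-form variants rarely appear in correctly decoded modern text, so a payload dominated by them is a strong hint that the wrong codepage was used. A minimal, self-contained illustration of that signal (the sample code points are hypothetical examples, not taken from this commit):

```python
import unicodedata

# U+FE8D is an Arabic presentation form; its Unicode name ends with
# "ISOLATED FORM", which is exactly the marker the new probe keys on.
print(unicodedata.name("\uFE8D"))  # ARABIC LETTER ALEF ISOLATED FORM
print(unicodedata.name("\u0627"))  # ARABIC LETTER ALEF (the regular code point)

print("ISOLATED FORM" in unicodedata.name("\uFE8D"))  # True
```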
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -6,7 +6,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

### Fixed
- Unintentional memory usage regression when using a large payload that matches several encodings (#376)
- Regression on some detection cases showcased in the documentation (#371)

### Added
- Noise (md) probe that identifies malformed Arabic representations due to the presence of letters in isolated form (credit to my wife)

## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)

33 changes: 32 additions & 1 deletion charset_normalizer/md.py
@@ -9,6 +9,8 @@
)
from .utils import (
is_accentuated,
is_arabic,
is_arabic_isolated_form,
is_case_variable,
is_cjk,
is_emoticon,
@@ -127,8 +129,9 @@ def reset(self) -> None: # pragma: no cover

@property
def ratio(self) -> float:
-        if self._character_count == 0 or self._character_count < 8:
+        if self._character_count < 8:
return 0.0

ratio_of_accentuation: float = self._accentuated_count / self._character_count
return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0

@@ -455,6 +458,34 @@ def ratio(self) -> float:
return self._successive_upper_lower_count_final / self._character_count


class ArabicIsolatedFormPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._character_count: int = 0
self._isolated_form_count: int = 0

def reset(self) -> None: # pragma: no cover
self._character_count = 0
self._isolated_form_count = 0

def eligible(self, character: str) -> bool:
return is_arabic(character)

def feed(self, character: str) -> None:
self._character_count += 1

if is_arabic_isolated_form(character):
self._isolated_form_count += 1

@property
def ratio(self) -> float:
if self._character_count < 8:
return 0.0

isolated_form_usage: float = self._isolated_form_count / self._character_count

return isolated_form_usage


@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
unicode_range_a: Optional[str], unicode_range_b: Optional[str]
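A short usage sketch of the plugin added above, assuming a charset_normalizer build that contains this commit. The sample string is made up for illustration and uses eight isolated-form letters, so it clears the plugin's minimum of eight fed characters:

```python
from charset_normalizer.md import ArabicIsolatedFormPlugin

plugin = ArabicIsolatedFormPlugin()

# Eight Arabic presentation-form (isolated) letters:
# alef-hamza, beh, teh, theh, jeem, hah, khah, dal.
suspicious = "\uFE83\uFE8F\uFE95\uFE99\uFE9D\uFEA1\uFEA5\uFEA9"

for character in suspicious:
    if plugin.eligible(character):  # true for any character whose name contains "ARABIC"
        plugin.feed(character)

print(plugin.ratio)  # 1.0 -> every eligible character is an isolated form
```

In the detector itself this ratio is combined with the other plugins' ratios into the overall mess score; it is not used on its own.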
22 changes: 22 additions & 0 deletions charset_normalizer/utils.py
@@ -32,6 +32,8 @@ def is_accentuated(character: str) -> bool:
or "WITH DIAERESIS" in description
or "WITH CIRCUMFLEX" in description
or "WITH TILDE" in description
or "WITH MACRON" in description
or "WITH RING ABOVE" in description
)


@@ -174,6 +176,26 @@ def is_thai(character: str) -> bool:
return "THAI" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False

return "ARABIC" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False

return "ARABIC" in character_name and "ISOLATED FORM" in character_name


@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
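The two clauses added to is_accentuated mean that letters such as "ā" (LATIN SMALL LETTER A WITH MACRON) and "å" (LATIN SMALL LETTER A WITH RING ABOVE) now count as accentuated. A quick check of the touched helpers, assuming the charset_normalizer.utils import path shown in this diff:

```python
from charset_normalizer.utils import (
    is_accentuated,
    is_arabic,
    is_arabic_isolated_form,
)

print(is_accentuated("\u0101"))           # True  - LATIN SMALL LETTER A WITH MACRON
print(is_accentuated("\u00E5"))           # True  - LATIN SMALL LETTER A WITH RING ABOVE
print(is_arabic("\u0627"))                # True  - ARABIC LETTER ALEF
print(is_arabic_isolated_form("\u0627"))  # False - a regular letter, not a presentation form
print(is_arabic_isolated_form("\uFE8D"))  # True  - ARABIC LETTER ALEF ISOLATED FORM
```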
8 changes: 8 additions & 0 deletions tests/test_base_detection.py
@@ -115,3 +115,11 @@ def test_alphabets_property():
assert "Basic Latin" in best_guess.alphabets
assert "Emoticons range(Emoji)" in best_guess.alphabets
assert best_guess.alphabets.count("Basic Latin") == 1


def test_doc_example_short_cp1251():
best_guess = from_bytes(
'Bсеки човек има право на образование.'.encode('cp1251')
).best()

assert best_guess.encoding == "cp1251"
3 changes: 3 additions & 0 deletions tests/test_large_payload.py
@@ -12,6 +12,7 @@ def test_large_payload_u8_sig_basic_entry():
assert best_guess.encoding == "utf_8", "Large U8 payload case detection wrongly detected!"
assert best_guess.bom is True, "SIG/BOM property should be True"
assert len(best_guess.raw) == len(payload), "Large payload should remain untouched when accessed through .raw"
assert best_guess._string is not None, "str should be decoded before direct access (sig available)"


def test_large_payload_ascii_basic_entry():
@@ -22,6 +23,7 @@ def test_large_payload_ascii_basic_entry():
assert best_guess.encoding == "ascii", "Large ASCII payload case detection wrongly detected!"
assert best_guess.bom is False, "SIG/BOM property should be False"
assert len(best_guess.raw) == len(payload), "Large payload should remain untouched when accessed through .raw"
assert best_guess._string is None, "str should not be decoded until direct access"


def test_misleading_large_sequence():
@@ -32,5 +34,6 @@ def test_misleading_large_sequence():
assert len(guesses) > 0
match = guesses.best()
assert match is not None
assert match._string is not None, "str should be cached as only match"
assert match.encoding == 'utf_8'
assert str(match) is not None
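The new assertions inspect the private `_string` cache to pin down when decoding happens: the lone surviving UTF-8 match is decoded eagerly, while the large ASCII payload stays undecoded until the text is actually requested. From the public API, that request is simply `str(match)`, as the last assertion shows. A minimal sketch of the public-facing behaviour, with a made-up payload:

```python
from charset_normalizer import from_bytes

# Hypothetical large, purely ASCII payload; the real tests use fixture files.
payload = b"This is a rather large, perfectly plain ASCII payload. " * 20_000

match = from_bytes(payload).best()
assert match is not None
assert match.encoding == "ascii"

text = str(match)  # the decoded string is materialized (or served from cache) here
assert len(text) > 0
```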
2 changes: 1 addition & 1 deletion tests/test_mess_detection.py
@@ -12,7 +12,7 @@
("´Á¥½³ø§i -- ±i®Ìºû, ³¯·Ø©v", 0.5, 1.),
("ïstanbul, T■rkiye'nin en kalabal»k, iktisadi ve k■lt■rel aÓ»dan en —nemli", 0.1, 0.5),
("<i>Parce que Óa, c'est la vÕritable histoire de la rencontre avec votre Tante Robin.</i>", 0.01, 0.5),
("""ØĢØŠØģاØĶŲ„ Ų„Ųˆ ØĢŲ† اŲ„Ų†Ø§Øģ ŲŠŲˆŲ… Ų…ا ØģŲˆŲŲŠØŠØģاØĶŲ„ŲˆŲ†ØŒ ØŊØđŲ†Ø§ Ų†ØģŲ…Øđ ØđŲ† (ŲØąŲˆØŊŲˆ) ŲˆØ§Ų„ØŪا؊Ų…""", 0.8, 2.0),
("""ØĢØŠØģاØĶŲ„ Ų„Ųˆ ØĢŲ† اŲ„Ų†Ø§Øģ ŲŠŲˆŲ… Ų…ا ØģŲˆŲŲŠØŠØģاØĶŲ„ŲˆŲ†ØŒ ØŊØđŲ†Ø§ Ų†ØģŲ…Øđ ØđŲ† (ŲØąŲˆØŊŲˆ) ŲˆØ§Ų„ØŪا؊Ų…""", 0.8, 3.0),
("""ÇáÚŞáíÉ , ÇáÊäæíã ÇáãÛäÇØíÓí æ / Ãæ ÇáÇŞÊÑÇÍ""", 0.8, 2.5),
("""hishamkoc@yahoo.com ุชุฑุฌู…ู€ู€ุฉ ู‡ู€ุดู€ู€ู€ุงู… ุงู„ู€ู‚ู€ู€ู€ู€ู„ุงูRadoZ ุชู€ู€ู€ุนู€ู€ู€ู€ุฏูŠู€ู€ู„ ุงู„ู€ู€ู€ุชู€ู€ู€ู€ูˆู‚ู€ู€ูŠู€ู€ู€ู€ุช ู…ู€ู€ู€ู† ู‚ู€ู€ุจู€ู€ู„""", 0.5, 2.0)
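The only change in this table is the expected upper bound for the garbled Arabic sample, raised from 2.0 to 3.0, likely because the widened is_accentuated check now matches letters present in the sample such as "Ū" (U WITH MACRON), so its mess score rises and the tolerance follows. A hedged sketch of running the same kind of check by hand, assuming the mess_ratio helper these tests exercise is importable from charset_normalizer.md (the sample string below is a stand-in, not the one from the test):

```python
from charset_normalizer.md import mess_ratio

# A stand-in for a mis-decoded Arabic payload: nothing but
# presentation-form (isolated) letters.
decoded = "\uFE83\uFE8F\uFE95\uFE99\uFE9D\uFEA1\uFEA5\uFEA9"

score = mess_ratio(decoded)
print(score)  # expected to sit well above zero with the new probe in place
```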
