🎨 Replace AST typing to the native syntax (3.6+) (#193)
Ousret authored Jun 18, 2022
1 parent 6ac98eb commit 58c93ff
Showing 7 changed files with 183 additions and 189 deletions.
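The change is mechanical throughout: annotations written as "# type:" comments (the workaround needed before Python 3.6) are rewritten as native PEP 526 variable annotations. A minimal standalone sketch of the before/after, using two assignments that appear verbatim in the api.py hunks below (only the typing import is added so the snippet runs on its own):

    from typing import List, Set

    # Before: comment-based annotations, read only by type checkers.
    prioritized_encodings = []  # type: List[str]
    tested = set()  # type: Set[str]

    # After: native variable annotations, valid syntax from Python 3.6 onwards.
    prioritized_encodings: List[str] = []
    tested: Set[str] = set()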
56 changes: 27 additions & 29 deletions charset_normalizer/api.py
@@ -67,11 +67,11 @@ def from_bytes(
)

if explain:
previous_logger_level = logger.level # type: int
previous_logger_level: int = logger.level
logger.addHandler(explain_handler)
logger.setLevel(TRACE)

length = len(sequences) # type: int
length: int = len(sequences)

if length == 0:
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
@@ -116,8 +116,8 @@ def from_bytes(
if steps > 1 and length / steps < chunk_size:
chunk_size = int(length / steps)

is_too_small_sequence = len(sequences) < TOO_SMALL_SEQUENCE # type: bool
is_too_large_sequence = len(sequences) >= TOO_BIG_SEQUENCE # type: bool
is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE

if is_too_small_sequence:
logger.log(
@@ -134,11 +134,11 @@
),
)

prioritized_encodings = [] # type: List[str]
prioritized_encodings: List[str] = []

specified_encoding = (
specified_encoding: Optional[str] = (
any_specified_encoding(sequences) if preemptive_behaviour else None
) # type: Optional[str]
)

if specified_encoding is not None:
prioritized_encodings.append(specified_encoding)
@@ -148,15 +148,15 @@ def from_bytes(
specified_encoding,
)

tested = set() # type: Set[str]
tested_but_hard_failure = [] # type: List[str]
tested_but_soft_failure = [] # type: List[str]
tested: Set[str] = set()
tested_but_hard_failure: List[str] = []
tested_but_soft_failure: List[str] = []

fallback_ascii = None # type: Optional[CharsetMatch]
fallback_u8 = None # type: Optional[CharsetMatch]
fallback_specified = None # type: Optional[CharsetMatch]
fallback_ascii: Optional[CharsetMatch] = None
fallback_u8: Optional[CharsetMatch] = None
fallback_specified: Optional[CharsetMatch] = None

results = CharsetMatches() # type: CharsetMatches
results: CharsetMatches = CharsetMatches()

sig_encoding, sig_payload = identify_sig_or_bom(sequences)

@@ -187,11 +187,11 @@ def from_bytes(

tested.add(encoding_iana)

decoded_payload = None # type: Optional[str]
bom_or_sig_available = sig_encoding == encoding_iana # type: bool
strip_sig_or_bom = bom_or_sig_available and should_strip_sig_or_bom(
decoded_payload: Optional[str] = None
bom_or_sig_available: bool = sig_encoding == encoding_iana
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
encoding_iana
) # type: bool
)

if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
logger.log(
@@ -202,7 +202,7 @@ def from_bytes(
continue

try:
is_multi_byte_decoder = is_multi_byte_encoding(encoding_iana) # type: bool
is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
except (ModuleNotFoundError, ImportError):
logger.log(
TRACE,
@@ -237,7 +237,7 @@ def from_bytes(
tested_but_hard_failure.append(encoding_iana)
continue

similar_soft_failure_test = False # type: bool
similar_soft_failure_test: bool = False

for encoding_soft_failed in tested_but_soft_failure:
if is_cp_similar(encoding_iana, encoding_soft_failed):
@@ -259,11 +259,11 @@
int(length / steps),
)

multi_byte_bonus = (
multi_byte_bonus: bool = (
is_multi_byte_decoder
and decoded_payload is not None
and len(decoded_payload) < length
) # type: bool
)

if multi_byte_bonus:
logger.log(
@@ -273,13 +273,13 @@
encoding_iana,
)

max_chunk_gave_up = int(len(r_) / 4) # type: int
max_chunk_gave_up: int = int(len(r_) / 4)

max_chunk_gave_up = max(max_chunk_gave_up, 2)
early_stop_count = 0 # type: int
early_stop_count: int = 0
lazy_str_hard_failure = False

md_chunks = [] # type: List[str]
md_chunks: List[str] = []
md_ratios = []

try:
@@ -334,9 +334,7 @@ def from_bytes(
tested_but_hard_failure.append(encoding_iana)
continue

mean_mess_ratio = (
sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
) # type: float
mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
tested_but_soft_failure.append(encoding_iana)
logger.log(
@@ -371,7 +369,7 @@ def from_bytes(
)

if not is_multi_byte_decoder:
target_languages = encoding_languages(encoding_iana) # type: List[str]
target_languages: List[str] = encoding_languages(encoding_iana)
else:
target_languages = mb_encoding_languages(encoding_iana)

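All of the api.py hunks above sit inside the library's public from_bytes() entry point (the function named in the hunk headers). For orientation, a minimal usage sketch of that API; the sample payload is invented and the printed encoding is only an example of a possible result:

    from charset_normalizer import from_bytes

    payload = "Всеки човек има право на образование.".encode("cp1251")  # made-up sample bytes

    results = from_bytes(payload)   # returns a CharsetMatches container
    best_guess = results.best()     # best CharsetMatch, or None if nothing matched

    if best_guess is not None:
        print(best_guess.encoding)  # detected IANA name, e.g. "cp1251"
        print(str(best_guess))      # the payload decoded with that encoding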
76 changes: 38 additions & 38 deletions charset_normalizer/cd.py
@@ -26,15 +26,15 @@ def encoding_unicode_range(iana_name: str) -> List[str]:

decoder = importlib.import_module("encodings.{}".format(iana_name)).IncrementalDecoder # type: ignore

p = decoder(errors="ignore") # type: IncrementalDecoder
seen_ranges = {} # type: Dict[str, int]
character_count = 0 # type: int
p: IncrementalDecoder = decoder(errors="ignore")
seen_ranges: Dict[str, int] = {}
character_count: int = 0

for i in range(0x40, 0xFF):
chunk = p.decode(bytes([i])) # type: str
chunk: str = p.decode(bytes([i]))

if chunk:
character_range = unicode_range(chunk) # type: Optional[str]
character_range: Optional[str] = unicode_range(chunk)

if character_range is None:
continue
@@ -58,7 +58,7 @@ def unicode_range_languages(primary_range: str) -> List[str]:
"""
Return inferred languages used with a unicode range.
"""
languages = [] # type: List[str]
languages: List[str] = []

for language, characters in FREQUENCIES.items():
for character in characters:
@@ -75,8 +75,8 @@ def encoding_languages(iana_name: str) -> List[str]:
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
This function does the correspondence.
"""
unicode_ranges = encoding_unicode_range(iana_name) # type: List[str]
primary_range = None # type: Optional[str]
unicode_ranges: List[str] = encoding_unicode_range(iana_name)
primary_range: Optional[str] = None

for specified_range in unicode_ranges:
if "Latin" not in specified_range:
@@ -115,8 +115,8 @@ def get_target_features(language: str) -> Tuple[bool, bool]:
"""
Determine main aspects from a supported language if it contains accents and if is pure Latin.
"""
target_have_accents = False # type: bool
target_pure_latin = True # type: bool
target_have_accents: bool = False
target_pure_latin: bool = True

for character in FREQUENCIES[language]:
if not target_have_accents and is_accentuated(character):
@@ -133,7 +133,7 @@ def alphabet_languages(
"""
Return associated languages associated to given characters.
"""
languages = [] # type: List[Tuple[str, float]]
languages: List[Tuple[str, float]] = []

source_have_accents = any(is_accentuated(character) for character in characters)

@@ -147,13 +147,13 @@
if target_have_accents is False and source_have_accents:
continue

character_count = len(language_characters) # type: int
character_count: int = len(language_characters)

character_match_count = len(
character_match_count: int = len(
[c for c in language_characters if c in characters]
) # type: int
)

ratio = character_match_count / character_count # type: float
ratio: float = character_match_count / character_count

if ratio >= 0.2:
languages.append((language, ratio))
@@ -174,33 +174,33 @@ def characters_popularity_compare(
if language not in FREQUENCIES:
raise ValueError("{} not available".format(language))

character_approved_count = 0 # type: int
character_approved_count: int = 0
FREQUENCIES_language_set = set(FREQUENCIES[language])

for character in ordered_characters:
if character not in FREQUENCIES_language_set:
continue

characters_before_source = FREQUENCIES[language][
characters_before_source: List[str] = FREQUENCIES[language][
0 : FREQUENCIES[language].index(character)
] # type: List[str]
characters_after_source = FREQUENCIES[language][
]
characters_after_source: List[str] = FREQUENCIES[language][
FREQUENCIES[language].index(character) :
] # type: List[str]
characters_before = ordered_characters[
]
characters_before: List[str] = ordered_characters[
0 : ordered_characters.index(character)
] # type: List[str]
characters_after = ordered_characters[
]
characters_after: List[str] = ordered_characters[
ordered_characters.index(character) :
] # type: List[str]
]

before_match_count = len(
before_match_count: int = len(
set(characters_before) & set(characters_before_source)
) # type: int
)

after_match_count = len(
after_match_count: int = len(
set(characters_after) & set(characters_after_source)
) # type: int
)

if len(characters_before_source) == 0 and before_match_count <= 4:
character_approved_count += 1
@@ -232,12 +232,12 @@ def alpha_unicode_split(decoded_sequence: str) -> List[str]:
if character.isalpha() is False:
continue

character_range = unicode_range(character) # type: Optional[str]
character_range: Optional[str] = unicode_range(character)

if character_range is None:
continue

layer_target_range = None # type: Optional[str]
layer_target_range: Optional[str] = None

for discovered_range in layers:
if (
@@ -296,33 +296,33 @@ def coherence_ratio(
A layer = Character extraction by alphabets/ranges.
"""

results = [] # type: List[Tuple[str, float]]
ignore_non_latin = False # type: bool
results: List[Tuple[str, float]] = []
ignore_non_latin: bool = False

sufficient_match_count = 0 # type: int
sufficient_match_count: int = 0

lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
if "Latin Based" in lg_inclusion_list:
ignore_non_latin = True
lg_inclusion_list.remove("Latin Based")

for layer in alpha_unicode_split(decoded_sequence):
sequence_frequencies = Counter(layer) # type: Counter
sequence_frequencies: Counter = Counter(layer)
most_common = sequence_frequencies.most_common()

character_count = sum(o for c, o in most_common) # type: int
character_count: int = sum(o for c, o in most_common)

if character_count <= TOO_SMALL_SEQUENCE:
continue

popular_character_ordered = [c for c, o in most_common] # type: List[str]
popular_character_ordered: List[str] = [c for c, o in most_common]

for language in lg_inclusion_list or alphabet_languages(
popular_character_ordered, ignore_non_latin
):
ratio = characters_popularity_compare(
ratio: float = characters_popularity_compare(
language, popular_character_ordered
) # type: float
)

if ratio < threshold:
continue
2 changes: 1 addition & 1 deletion charset_normalizer/cli/normalizer.py
@@ -229,7 +229,7 @@ def cli_detect(argv: List[str] = None) -> int:
my_file.close()
continue

o_ = my_file.name.split(".") # type: List[str]
o_: List[str] = my_file.name.split(".")

if args.replace is False:
o_.insert(-1, best_guess.encoding)
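For context on the single cli/normalizer.py change: o_ holds the input filename split on dots, and when the replace option (args.replace) is not set, the detected encoding is inserted before the extension to build the output filename. A tiny standalone sketch of that naming step, with a hypothetical filename and detection result:

    from typing import List

    name = "sample.txt"     # hypothetical input filename
    encoding = "cp1252"     # hypothetical best_guess.encoding

    o_: List[str] = name.split(".")
    o_.insert(-1, encoding)  # mirrors the "args.replace is False" branch above

    print(".".join(o_))      # -> sample.cp1252.txt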
(diffs for the remaining 4 changed files are not shown)
