🎨 Replace AST typing to the native syntax (3.6+) (#193)
Ousret authored Jun 18, 2022
1 parent 6ac98eb commit 58c93ff
Showing 7 changed files with 183 additions and 189 deletions.
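The change is mechanical throughout: annotations written as "# type:" comments (the workaround needed before Python 3.6) are rewritten as native PEP 526 variable annotations. A minimal standalone sketch of the before/after, using two assignments that appear verbatim in the api.py hunks below (only the typing import is added so the snippet runs on its own):

    from typing import List, Set

    # Before: comment-based annotations, read only by type checkers.
    prioritized_encodings = []  # type: List[str]
    tested = set()  # type: Set[str]

    # After: native variable annotations, valid syntax from Python 3.6 onwards.
    prioritized_encodings: List[str] = []
    tested: Set[str] = set()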
56 changes: 27 additions & 29 deletions charset_normalizer/api.py
@@ -67,11 +67,11 @@ def from_bytes(
)

if explain:
previous_logger_level = logger.level # type: int
previous_logger_level: int = logger.level
logger.addHandler(explain_handler)
logger.setLevel(TRACE)

length = len(sequences) # type: int
length: int = len(sequences)

if length == 0:
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
@@ -116,8 +116,8 @@ def from_bytes(
if steps > 1 and length / steps < chunk_size:
chunk_size = int(length / steps)

is_too_small_sequence = len(sequences) < TOO_SMALL_SEQUENCE # type: bool
is_too_large_sequence = len(sequences) >= TOO_BIG_SEQUENCE # type: bool
is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE

if is_too_small_sequence:
logger.log(
@@ -134,11 +134,11 @@
),
)

prioritized_encodings = [] # type: List[str]
prioritized_encodings: List[str] = []

specified_encoding = (
specified_encoding: Optional[str] = (
any_specified_encoding(sequences) if preemptive_behaviour else None
) # type: Optional[str]
)

if specified_encoding is not None:
prioritized_encodings.append(specified_encoding)
@@ -148,15 +148,15 @@ def from_bytes(
specified_encoding,
)

tested = set() # type: Set[str]
tested_but_hard_failure = [] # type: List[str]
tested_but_soft_failure = [] # type: List[str]
tested: Set[str] = set()
tested_but_hard_failure: List[str] = []
tested_but_soft_failure: List[str] = []

fallback_ascii = None # type: Optional[CharsetMatch]
fallback_u8 = None # type: Optional[CharsetMatch]
fallback_specified = None # type: Optional[CharsetMatch]
fallback_ascii: Optional[CharsetMatch] = None
fallback_u8: Optional[CharsetMatch] = None
fallback_specified: Optional[CharsetMatch] = None

results = CharsetMatches() # type: CharsetMatches
results: CharsetMatches = CharsetMatches()

sig_encoding, sig_payload = identify_sig_or_bom(sequences)

@@ -187,11 +187,11 @@ def from_bytes(

tested.add(encoding_iana)

decoded_payload = None # type: Optional[str]
bom_or_sig_available = sig_encoding == encoding_iana # type: bool
strip_sig_or_bom = bom_or_sig_available and should_strip_sig_or_bom(
decoded_payload: Optional[str] = None
bom_or_sig_available: bool = sig_encoding == encoding_iana
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
encoding_iana
) # type: bool
)

if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
logger.log(
@@ -202,7 +202,7 @@ def from_bytes(
continue

try:
is_multi_byte_decoder = is_multi_byte_encoding(encoding_iana) # type: bool
is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
except (ModuleNotFoundError, ImportError):
logger.log(
TRACE,
@@ -237,7 +237,7 @@ def from_bytes(
tested_but_hard_failure.append(encoding_iana)
continue

similar_soft_failure_test = False # type: bool
similar_soft_failure_test: bool = False

for encoding_soft_failed in tested_but_soft_failure:
if is_cp_similar(encoding_iana, encoding_soft_failed):
@@ -259,11 +259,11 @@
int(length / steps),
)

multi_byte_bonus = (
multi_byte_bonus: bool = (
is_multi_byte_decoder
and decoded_payload is not None
and len(decoded_payload) < length
) # type: bool
)

if multi_byte_bonus:
logger.log(
@@ -273,13 +273,13 @@
encoding_iana,
)

max_chunk_gave_up = int(len(r_) / 4) # type: int
max_chunk_gave_up: int = int(len(r_) / 4)

max_chunk_gave_up = max(max_chunk_gave_up, 2)
early_stop_count = 0 # type: int
early_stop_count: int = 0
lazy_str_hard_failure = False

md_chunks = [] # type: List[str]
md_chunks: List[str] = []
md_ratios = []

try:
@@ -334,9 +334,7 @@ def from_bytes(
tested_but_hard_failure.append(encoding_iana)
continue

mean_mess_ratio = (
sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
) # type: float
mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
tested_but_soft_failure.append(encoding_iana)
logger.log(
@@ -371,7 +369,7 @@ def from_bytes(
)

if not is_multi_byte_decoder:
target_languages = encoding_languages(encoding_iana) # type: List[str]
target_languages: List[str] = encoding_languages(encoding_iana)
else:
target_languages = mb_encoding_languages(encoding_iana)

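All of the api.py hunks above sit inside the library's public from_bytes() entry point (the function named in the hunk headers). For orientation, a minimal usage sketch of that API; the sample payload is invented and the printed encoding is only an example of a possible result:

    from charset_normalizer import from_bytes

    payload = "Всеки човек има право на образование.".encode("cp1251")  # made-up sample bytes

    results = from_bytes(payload)   # returns a CharsetMatches container
    best_guess = results.best()     # best CharsetMatch, or None if nothing matched

    if best_guess is not None:
        print(best_guess.encoding)  # detected IANA name, e.g. "cp1251"
        print(str(best_guess))      # the payload decoded with that encoding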
76 changes: 38 additions & 38 deletions charset_normalizer/cd.py
@@ -26,15 +26,15 @@ def encoding_unicode_range(iana_name: str) -> List[str]:

decoder = importlib.import_module("encodings.{}".format(iana_name)).IncrementalDecoder # type: ignore

p = decoder(errors="ignore") # type: IncrementalDecoder
seen_ranges = {} # type: Dict[str, int]
character_count = 0 # type: int
p: IncrementalDecoder = decoder(errors="ignore")
seen_ranges: Dict[str, int] = {}
character_count: int = 0

for i in range(0x40, 0xFF):
chunk = p.decode(bytes([i])) # type: str
chunk: str = p.decode(bytes([i]))

if chunk:
character_range = unicode_range(chunk) # type: Optional[str]
character_range: Optional[str] = unicode_range(chunk)

if character_range is None:
continue
@@ -58,7 +58,7 @@ def unicode_range_languages(primary_range: str) -> List[str]:
"""
Return inferred languages used with a unicode range.
"""
languages = [] # type: List[str]
languages: List[str] = []

for language, characters in FREQUENCIES.items():
for character in characters:
@@ -75,8 +75,8 @@ def encoding_languages(iana_name: str) -> List[str]:
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
This function does the correspondence.
"""
unicode_ranges = encoding_unicode_range(iana_name) # type: List[str]
primary_range = None # type: Optional[str]
unicode_ranges: List[str] = encoding_unicode_range(iana_name)
primary_range: Optional[str] = None

for specified_range in unicode_ranges:
if "Latin" not in specified_range:
@@ -115,8 +115,8 @@ def get_target_features(language: str) -> Tuple[bool, bool]:
"""
Determine main aspects from a supported language if it contains accents and if is pure Latin.
"""
target_have_accents = False # type: bool
target_pure_latin = True # type: bool
target_have_accents: bool = False
target_pure_latin: bool = True

for character in FREQUENCIES[language]:
if not target_have_accents and is_accentuated(character):
@@ -133,7 +133,7 @@ def alphabet_languages(
"""
Return associated languages associated to given characters.
"""
languages = [] # type: List[Tuple[str, float]]
languages: List[Tuple[str, float]] = []

source_have_accents = any(is_accentuated(character) for character in characters)

@@ -147,13 +147,13 @@
if target_have_accents is False and source_have_accents:
continue

character_count = len(language_characters) # type: int
character_count: int = len(language_characters)

character_match_count = len(
character_match_count: int = len(
[c for c in language_characters if c in characters]
) # type: int
)

ratio = character_match_count / character_count # type: float
ratio: float = character_match_count / character_count

if ratio >= 0.2:
languages.append((language, ratio))
@@ -174,33 +174,33 @@ def characters_popularity_compare(
if language not in FREQUENCIES:
raise ValueError("{} not available".format(language))

character_approved_count = 0 # type: int
character_approved_count: int = 0
FREQUENCIES_language_set = set(FREQUENCIES[language])

for character in ordered_characters:
if character not in FREQUENCIES_language_set:
continue

characters_before_source = FREQUENCIES[language][
characters_before_source: List[str] = FREQUENCIES[language][
0 : FREQUENCIES[language].index(character)
] # type: List[str]
characters_after_source = FREQUENCIES[language][
]
characters_after_source: List[str] = FREQUENCIES[language][
FREQUENCIES[language].index(character) :
] # type: List[str]
characters_before = ordered_characters[
]
characters_before: List[str] = ordered_characters[
0 : ordered_characters.index(character)
] # type: List[str]
characters_after = ordered_characters[
]
characters_after: List[str] = ordered_characters[
ordered_characters.index(character) :
] # type: List[str]
]

before_match_count = len(
before_match_count: int = len(
set(characters_before) & set(characters_before_source)
) # type: int
)

after_match_count = len(
after_match_count: int = len(
set(characters_after) & set(characters_after_source)
) # type: int
)

if len(characters_before_source) == 0 and before_match_count <= 4:
character_approved_count += 1
@@ -232,12 +232,12 @@ def alpha_unicode_split(decoded_sequence: str) -> List[str]:
if character.isalpha() is False:
continue

character_range = unicode_range(character) # type: Optional[str]
character_range: Optional[str] = unicode_range(character)

if character_range is None:
continue

layer_target_range = None # type: Optional[str]
layer_target_range: Optional[str] = None

for discovered_range in layers:
if (
@@ -296,33 +296,33 @@ def coherence_ratio(
A layer = Character extraction by alphabets/ranges.
"""

results = [] # type: List[Tuple[str, float]]
ignore_non_latin = False # type: bool
results: List[Tuple[str, float]] = []
ignore_non_latin: bool = False

sufficient_match_count = 0 # type: int
sufficient_match_count: int = 0

lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
if "Latin Based" in lg_inclusion_list:
ignore_non_latin = True
lg_inclusion_list.remove("Latin Based")

for layer in alpha_unicode_split(decoded_sequence):
sequence_frequencies = Counter(layer) # type: Counter
sequence_frequencies: Counter = Counter(layer)
most_common = sequence_frequencies.most_common()

character_count = sum(o for c, o in most_common) # type: int
character_count: int = sum(o for c, o in most_common)

if character_count <= TOO_SMALL_SEQUENCE:
continue

popular_character_ordered = [c for c, o in most_common] # type: List[str]
popular_character_ordered: List[str] = [c for c, o in most_common]

for language in lg_inclusion_list or alphabet_languages(
popular_character_ordered, ignore_non_latin
):
ratio = characters_popularity_compare(
ratio: float = characters_popularity_compare(
language, popular_character_ordered
) # type: float
)

if ratio < threshold:
continue
2 changes: 1 addition & 1 deletion charset_normalizer/cli/normalizer.py
@@ -229,7 +229,7 @@ def cli_detect(argv: List[str] = None) -> int:
my_file.close()
continue

o_ = my_file.name.split(".") # type: List[str]
o_: List[str] = my_file.name.split(".")

if args.replace is False:
o_.insert(-1, best_guess.encoding)
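For context on the single cli/normalizer.py change: o_ holds the input filename split on dots, and when the replace option (args.replace) is not set, the detected encoding is inserted before the extension to build the output filename. A tiny standalone sketch of that naming step, with a hypothetical filename and detection result:

    from typing import List

    name = "sample.txt"     # hypothetical input filename
    encoding = "cp1252"     # hypothetical best_guess.encoding

    o_: List[str] = name.split(".")
    o_.insert(-1, encoding)  # mirrors the "args.replace is False" branch above

    print(".".join(o_))      # -> sample.cp1252.txt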
(diffs for the remaining 4 changed files are not shown)
