Skip to content

Commit

Permalink
Merge pull request #8 from Ousret/upgrade-0.3
Browse files Browse the repository at this point in the history
Upgrade to 0.3
  • Loading branch information
Ousret authored Sep 12, 2019
2 parents 17924e4 + d5473af commit 6009bf8
Show file tree
Hide file tree
Showing 13 changed files with 466 additions and 191 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ This project offers you an alternative to **Universal Charset Encoding Detector**,

| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
| ------------- | :-------------: | :------------------: | :------------------: |
| `Fast` | ❌<br> 🐌🐌 | <br> | ✅ <br>⚡ |
| `Fast` | ❌<br> | <br> | ✅ <br>⚡ |
| `Universal**` ||||
| `Reliable` **without** distinguishable standards ||||
| `Reliable` **with** distinguishable standards ||||
Expand Down
1 change: 1 addition & 0 deletions charset_normalizer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@
from charset_normalizer.unicode import UnicodeRangeIdentify
from charset_normalizer.probe_chaos import ProbeChaos
from charset_normalizer.probe_coherence import ProbeCoherence
from charset_normalizer.probe_words import ProbeWords
from charset_normalizer.legacy import detect
12 changes: 10 additions & 2 deletions charset_normalizer/cli/normalizer.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import argparse
import sys

from charset_normalizer import CharsetNormalizerMatches
from prettytable import PrettyTable

from charset_normalizer import CharsetNormalizerMatches


def query_yes_no(question, default="yes"):
"""Ask a yes/no question via input() and return their answer.
Expand Down Expand Up @@ -56,6 +57,8 @@ def cli_detect(argv=None):
help='Replace file when trying to normalize it instead of creating a new one.')
parser.add_argument('--force', action="store_true", default=False, dest='force',
help='Replace file without asking if you are sure, use this flag with caution.')
parser.add_argument('--threshold', action="store", default=0.2, type=float, dest='threshold',
help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.")

args = parser.parse_args(argv)

Expand All @@ -72,10 +75,15 @@ def cli_detect(argv=None):
print('Use --force in addition of --replace only.', file=sys.stderr)
return 1

if args.threshold < 0. or args.threshold > 1.:
print('--threshold VALUE should be between 0. AND 1.')
return 1

for my_file in args.file:

matches = CharsetNormalizerMatches.from_fp(
my_file
my_file,
threshold=args.threshold
)

if len(matches) == 0:
Expand Down
18 changes: 18 additions & 0 deletions charset_normalizer/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,24 @@
"Variation Selectors Supplement"
]

# Keywords that occur inside Unicode range names (the list just above ends with
# "Variation Selectors Supplement", for instance).
# NOTE(review): presumably a range whose name contains one of these words is
# treated as "secondary" (supplementary symbols/marks rather than a primary
# script) — the consumer is not visible here, confirm before relying on it.
UNICODE_SECONDARY_RANGE_KEYWORD = [
'Supplement',
'Extended',
'Extensions',
'Modifier',
'Marks',
'Punctuation',
'Symbols',
'Forms',
'Operators',
'Miscellaneous',
'Drawing',
'Block',
'Shapes',
'Supplemental',
'Tags'
]

BYTE_ORDER_MARK = {
'utf_8': BOM_UTF8,
'utf_7': [
Expand Down
124 changes: 56 additions & 68 deletions charset_normalizer/normalizer.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
# coding: utf-8
import collections
import re
import statistics
from encodings.aliases import aliases
from os.path import basename, splitext
import collections
from platform import python_version_tuple

from cached_property import cached_property

from charset_normalizer.probe_coherence import ProbeCoherence, HashableCounter
from charset_normalizer.probe_chaos import ProbeChaos
from charset_normalizer.constant import BYTE_ORDER_MARK

from platform import python_version_tuple
from charset_normalizer.probe_chaos import ProbeChaos
from charset_normalizer.probe_coherence import ProbeCoherence, HashableCounter


class CharsetNormalizerMatch:
Expand Down Expand Up @@ -93,8 +92,13 @@ def language(self):
:return: Most used/probable language in text
:rtype: str
"""
languages = ProbeCoherence(self.char_counter).most_likely
return languages[0] if len(languages) > 0 else ('English' if len(self.alphabets) == 1 and self.alphabets[0] == 'Basic Latin' else 'Unknown')
probe_coherence = ProbeCoherence(self.char_counter)
languages = probe_coherence.most_likely

if len(languages) == 0:
return 'English' if len(self.alphabets) == 1 and self.alphabets[0] == 'Basic Latin' else 'Unknown'

return languages[0]

@cached_property
def chaos(self):
Expand Down Expand Up @@ -194,7 +198,7 @@ def __len__(self):
return len(self._matches)

@staticmethod
def normalize(path, steps=10, chunk_size=512, threshold=0.09):
def normalize(path, steps=10, chunk_size=512, threshold=0.20):
"""
:param str path:
:param int steps:
Expand Down Expand Up @@ -226,7 +230,7 @@ def normalize(path, steps=10, chunk_size=512, threshold=0.09):
return b_

@staticmethod
def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.09):
def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20):
"""
Take a sequence of bytes that could potentially be decoded to str and discard all obvious non supported
charset encoding.
Expand All @@ -244,7 +248,7 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.09):
supported = sorted(aliases.items()) if py_need_sort else aliases.items()

tested = set()
working = dict()
matches = list()

maximum_length = len(sequences)

Expand Down Expand Up @@ -286,70 +290,54 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.09):
except LookupError:
continue

chaos_measures = list()
ranges_encountered_t = dict()
decoded_len_t = 0

successive_chaos_zero = 0
r_ = range(
0 if bom_available is False else bom_len,
maximum_length,
int(maximum_length / steps)
)
p_ = len(r_)

for i in r_:

chunk = sequences[i:i + chunk_size]
decoded = str(chunk, encoding=p, errors='ignore')

probe_chaos = ProbeChaos(decoded, giveup_threshold=threshold)
chaos_measure, ranges_encountered = probe_chaos.ratio, probe_chaos.encountered_unicode_range_occurrences

for k, e in ranges_encountered.items():
if k not in ranges_encountered_t.keys():
ranges_encountered_t[k] = 0
ranges_encountered_t[k] += e

if bom_available is True:
if chaos_measure > 0.:
chaos_measure /= 2
else:
chaos_measure = -1.

if chaos_measure > threshold:
if p in working.keys():
del working[p]
break
elif chaos_measure == 0.:
successive_chaos_zero += 1
if steps > 2 and successive_chaos_zero > p_ / 2:
break
elif chaos_measure > 0. and successive_chaos_zero > 0:
successive_chaos_zero = 0

chaos_measures.append(chaos_measure)

if p not in working.keys():
working[p] = dict()

if p in working.keys():
working[p]['ratio'] = statistics.mean(chaos_measures)
working[p]['ranges'] = ranges_encountered_t
working[p]['chaos'] = sum(chaos_measures)
working[p]['len'] = decoded_len_t
working[p]['bom'] = bom_available
working[p]['bom_len'] = bom_len

if p == 'ascii' and p in working.keys() and working[p]['ratio'] == 0.:
break

return CharsetNormalizerMatches(
[CharsetNormalizerMatch(sequences if working[enc]['bom'] is False else sequences[working[enc]['bom_len']:], enc, working[enc]['ratio'], working[enc]['ranges'], working[enc]['bom']) for enc in
(sorted(working.keys()) if py_need_sort else working.keys()) if working[enc]['ratio'] <= threshold])

measures = [ProbeChaos(str(sequences[i:i + chunk_size], encoding=p, errors='ignore'), giveup_threshold=threshold) for i in r_]
ratios = [el.ratio for el in measures]
nb_gave_up = [el.gave_up is True or el.ratio >= threshold for el in measures].count(True)

chaos_means = statistics.mean(ratios)
chaos_median = statistics.median(ratios)
chaos_min = min(ratios)
chaos_max = max(ratios)

if (len(r_) >= 4 and nb_gave_up > len(r_) / 4) or chaos_median > threshold:
# print(p, 'is too much chaos for decoded input !')
continue

encountered_unicode_range_occurrences = dict()

for el in measures:
for u_name, u_occ in el.encountered_unicode_range_occurrences.items():
if u_name not in encountered_unicode_range_occurrences.keys():
encountered_unicode_range_occurrences[u_name] = 0
encountered_unicode_range_occurrences[u_name] += u_occ

# print(p, 'U RANGES', encountered_unicode_range_occurrences)

matches.append(
CharsetNormalizerMatch(
sequences if not bom_available else sequences[bom_len:],
p,
chaos_means,
encountered_unicode_range_occurrences,
bom_available
)
)

# print(p, nb_gave_up, chaos_means, chaos_median, chaos_min, chaos_max, matches[-1].coherence, matches[-1].language)

if (p == 'ascii' and chaos_median == 0.) or bom_available is True:
return CharsetNormalizerMatches([matches[-1]])

return CharsetNormalizerMatches(matches)

@staticmethod
def from_fp(fp, steps=10, chunk_size=512, threshold=0.09):
def from_fp(fp, steps=10, chunk_size=512, threshold=0.20):
"""
:param io.BinaryIO fp:
:param int steps:
Expand All @@ -365,7 +353,7 @@ def from_fp(fp, steps=10, chunk_size=512, threshold=0.09):
)

@staticmethod
def from_path(path, steps=10, chunk_size=512, threshold=0.09):
def from_path(path, steps=10, chunk_size=512, threshold=0.20):
"""
:param str path:
:param int steps:
Expand Down
71 changes: 65 additions & 6 deletions charset_normalizer/probe_chaos.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
# coding: utf-8
import re
from functools import lru_cache

from dragonmapper.hanzi import MIXED, BOTH, UNKNOWN
from dragonmapper.hanzi import identify as s_identify
from zhon.hanzi import sentence as cjc_sentence_re

from charset_normalizer.probe_coherence import HashableCounter
from charset_normalizer.probe_words import ProbeWords
from charset_normalizer.unicode import UnicodeRangeIdentify

from functools import lru_cache


@lru_cache(maxsize=8192)
class ProbeChaos:
Expand Down Expand Up @@ -48,14 +49,62 @@ def __init__(self, string, giveup_threshold=0.09):
self.total_upper_accent_encountered_inner = 0
self.total_unaccented_letter_encountered = 0

self._probe_word = ProbeWords(HashableCounter(self._string.split()))

self.gave_up = False

if len(self._string) >= 10:
self._probe()

def __add__(self, other):
    """
    Combine two probes into a new one holding the summed measurements.

    Numeric counters are added, the sets of encountered unicode ranges are
    united, per-range occurrence counts are summed, the word probes are
    added together and the probed strings are concatenated. Neither operand
    is modified.

    :param ProbeChaos other: Probe to merge with this one
    :return: A fresh probe carrying the merged state, built with this
        instance's threshold
    :rtype: ProbeChaos
    """
    # The module-level name ``ProbeChaos`` is wrapped by ``lru_cache``, so
    # calling it with ('', threshold) would hand back one shared cached
    # instance that every addition would then mutate. ``type(self)`` is the
    # undecorated class and always yields a new, empty accumulator.
    k_ = type(self)('', self._threshold)

    # Plain counters: the merged value is the sum of both operands.
    for attribute in (
        'successive_upper_lower',
        'successive_accent',
        'successive_different_unicode_range',
        'encountered_punc_sign',
        'unprintable',
        'encountered_white_space',
        'not_encountered_white_space',
        'not_encountered_white_space_reset',
        'total_letter_encountered',
        'total_lower_letter_encountered',
        'total_upper_accent_encountered',
        'total_upper_accent_encountered_inner',
        'total_unaccented_letter_encountered',
    ):
        setattr(k_, attribute, getattr(self, attribute) + getattr(other, attribute))

    for source in (self, other):
        # Union of the unicode ranges seen by either operand.
        for el in source.encountered_unicode_range:
            k_.encountered_unicode_range.add(el)

        # Per-range occurrence counts are accumulated across both operands.
        for u_name, u_occ in source.encountered_unicode_range_occurrences.items():
            k_.encountered_unicode_range_occurrences[u_name] = \
                k_.encountered_unicode_range_occurrences.get(u_name, 0) + u_occ

    k_._probe_word = self._probe_word + other._probe_word

    k_._string = self._string + other._string

    return k_

def _probe(self):

c__ = False
upper_lower_m = False

for c, i_ in zip(self._string, range(0, len(self._string))):

Expand Down Expand Up @@ -133,7 +182,13 @@ def _probe(self):
continue

if (is_lower and self.previous_printable_letter.isupper()) or (is_upper and self.previous_printable_letter.islower()):
self.successive_upper_lower += 1
if not upper_lower_m:
upper_lower_m = True
else:
self.successive_upper_lower += 1
upper_lower_m = False
else:
upper_lower_m = False

if is_latin:
self.previous_encountered_unicode_range = u_name
Expand All @@ -154,15 +209,19 @@ def _probe(self):

@staticmethod
def _unravel_cjk_suspicious_chinese(string, encountered_unicode_range_occurrences):
if len(string) <= 10:
return UNKNOWN

encountered_unicode_range = encountered_unicode_range_occurrences.keys()

if 'CJK Unified Ideographs' in encountered_unicode_range and ('Hiragana' not in encountered_unicode_range and 'Katakana' not in encountered_unicode_range):
i_ = s_identify(string)
if i_ in [MIXED, BOTH]:
return encountered_unicode_range_occurrences['CJK Unified Ideographs']
elif i_ != UNKNOWN and len(re.findall(cjc_sentence_re, string)) == 0:
return encountered_unicode_range_occurrences['CJK Unified Ideographs']
elif i_ != UNKNOWN and len(re.findall(cjc_sentence_re, string)) > 0:
return -encountered_unicode_range_occurrences['CJK Unified Ideographs']
elif i_ != UNKNOWN:
return int(encountered_unicode_range_occurrences['CJK Unified Ideographs']*0.3)

return UNKNOWN

Expand All @@ -178,4 +237,4 @@ def ratio(self):
r_ = self.total_upper_accent_encountered if self.total_letter_encountered > 0 and self.total_unaccented_letter_encountered / self.total_letter_encountered < 0.5 else 0
z_ = UnicodeRangeIdentify.unravel_suspicious_ranges(len(self._string), self.encountered_unicode_range_occurrences)
p_ = self.encountered_punc_sign if self.encountered_punc_sign / len(self._string) > 0.2 else 0
return (r_ + p_ + self.successive_upper_lower + self.successive_accent + self.successive_different_unicode_range + self.not_encountered_white_space + self.unprintable + z_ + ProbeChaos._unravel_cjk_suspicious_chinese.__func__(self._string, self.encountered_unicode_range_occurrences)) / len(self._string) # + len(self.encountered_unicode_range)-1
return ((r_ + p_ + self.successive_upper_lower + self.successive_accent + self.successive_different_unicode_range + self.not_encountered_white_space + self.unprintable + z_ + ProbeChaos._unravel_cjk_suspicious_chinese.__func__(self._string, self.encountered_unicode_range_occurrences)) / len(self._string)) + self._probe_word.ratio # + len(self.encountered_unicode_range)-1
Loading

0 comments on commit 6009bf8

Please sign in to comment.