Adapt chunk_size and steps according to sequence len (#15)
* Adapt chunk_size and steps according to sequence len

* Add pragma no coverage for frequencies json generator

* Add test for CLI query_yes_no and verbose output

* Reset path level for unittest

* bump 1.1.1
Ousret authored Sep 23, 2019
1 parent 38c77fa commit 5abfb83
Showing 4 changed files with 46 additions and 12 deletions.
30 changes: 22 additions & 8 deletions charset_normalizer/normalizer.py
@@ -5,6 +5,7 @@
 from encodings.aliases import aliases
 from os.path import basename, splitext
 from platform import python_version_tuple
+from warnings import warn

 from cached_property import cached_property

@@ -254,6 +255,7 @@ def __len__(self):
     @staticmethod
     def normalize(path, steps=10, chunk_size=512, threshold=0.20):
         """
+        :param str path:
         :param int steps:
         :param int chunk_size:
@@ -296,19 +298,31 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20):
         :return: List of potential matches
         :rtype: CharsetNormalizerMatches
         """
-        py_v = [int(el) for el in python_version_tuple()]
-        py_need_sort = py_v[0] < 3 or (py_v[0] == 3 and py_v[1] < 6)
-
-        supported = sorted(aliases.items()) if py_need_sort else aliases.items()
-
-        tested = set()
-        matches = list()
+        too_small_sequence = len(sequences) < 24
+
+        if too_small_sequence is True:
+            warn('Trying to detect encoding from a tiny portion of ({}) bytes.'.format(len(sequences)))
+
+        maximum_length = len(sequences)
+
+        # Adjust steps and chunk_size when content is just too small for it
+        if maximum_length <= (chunk_size * steps):
+            steps = 1
+
+        if maximum_length <= chunk_size:
+            chunk_size = maximum_length
+            steps = 1
+        elif steps > 1 and maximum_length / steps < chunk_size:
+            chunk_size = int(maximum_length / steps)
+
+        # Bellow Python 3.6, Expect dict to not behave the same.
+        py_v = [int(el) for el in python_version_tuple()]
+        py_need_sort = py_v[0] < 3 or (py_v[0] == 3 and py_v[1] < 6)
+
+        supported = collections.OrderedDict(aliases).items() if py_need_sort else aliases.items()
+
+        tested = set()
+        matches = list()

         for support in supported:
@@ -360,7 +374,7 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20):
             # chaos_max = max(ratios)

             if (len(r_) >= 4 and nb_gave_up > len(r_) / 4) or chaos_median > threshold:
-                # print(p, 'is too much chaos for decoded input !')
+                # print(p, 'is too much chaos for decoded input !', nb_gave_up, chaos_median)
                 continue

             encountered_unicode_range_occurrences = dict()
@@ -396,7 +410,7 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20):
                 )
             )

-            # print(p, nb_gave_up, chaos_means, chaos_median, chaos_min, chaos_max, matches[-1].coherence, matches[-1].languages,)
+            # print(p, nb_gave_up, chaos_means, chaos_median, matches[-1].coherence, matches[-1].languages,)

             if (p == 'ascii' and chaos_median == 0.) or bom_available is True:
                 return CharsetNormalizerMatches([matches[-1]])
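The guards added to from_bytes above are easiest to follow in isolation. Below is a minimal standalone sketch of the same arithmetic — the helper name adapt_probe_params is mine, not part of the library — with a few traced inputs:

def adapt_probe_params(length, steps=10, chunk_size=512):
    """Mirror the new from_bytes() guards: shrink steps and chunk_size
    so probing never expects more bytes than the payload actually has."""
    if length <= (chunk_size * steps):
        # Too small to cut into `steps` distinct chunks: probe once.
        steps = 1

    if length <= chunk_size:
        # The whole payload fits in one chunk: probe all of it.
        chunk_size = length
        steps = 1
    elif steps > 1 and length / steps < chunk_size:
        # Spread the probes evenly across the payload instead.
        chunk_size = int(length / steps)

    return steps, chunk_size

print(adapt_probe_params(100))    # (1, 100)  - one pass over the whole payload
print(adapt_probe_params(3000))   # (1, 512)  - a single 512-byte probe
print(adapt_probe_params(10000))  # (10, 512) - defaults kept, input is large enough

Incidentally, the elif branch appears unreachable as committed: it requires steps > 1, which the first guard only leaves intact when length > chunk_size * steps — the opposite of the length / steps < chunk_size condition it then tests.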
3 changes: 1 addition & 2 deletions charset_normalizer/probe_coherence.py
@@ -231,7 +231,7 @@ def _verify_order_on(target_alphabet_ordered, character_occurrences, distance_ma
         return n_not_rightfully_ranked / n_tested, n_tested, n_tested_verified

     @staticmethod
-    def frequencies_json(minimum_char_count=45000000, save_to_file=True, proxies=None):
+    def frequencies_json(minimum_char_count=45000000, save_to_file=True, proxies=None):  # pragma: no cover
         """
         This method refresh or create frequencies.json at will.
         Don't abuse it as it perform HTTP GET query
@@ -291,4 +291,3 @@ def frequencies_json(minimum_char_count=45000000, save_to_file=True, proxies=Non
         with open('{}/frequencies.json'.format(ProbeCoherence.ASSETS_PATH) if exists('{}/frequencies.json'.format(
             ProbeCoherence.ASSETS_PATH)) else './charset_normalizer/assets/frequencies.json', 'w', encoding='utf-8') as fp:
             json.dump(ProbeCoherence.FREQUENCIES, fp)
-

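For context, # pragma: no cover is coverage.py's standard exclusion marker: the flagged line, and any block it opens, is skipped during coverage measurement, which suits a maintenance helper that performs live HTTP requests. A minimal illustration of the pattern — the helper below is invented for the example, not taken from the library:

import urllib.request

def refresh_remote_asset(url):  # pragma: no cover
    # Everything under this def is excluded from coverage reporting,
    # so untested network I/O does not drag the measured percentage down.
    with urllib.request.urlopen(url) as response:
        return response.read()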
2 changes: 1 addition & 1 deletion setup.py
@@ -13,7 +13,7 @@
 EMAIL = 'ahmed.tahri@cloudnursery.dev'
 AUTHOR = 'Ahmed TAHRI @Ousret'
 REQUIRES_PYTHON = '>=3.5.0'
-VERSION = '1.1.0'
+VERSION = '1.1.1'

 REQUIRED = [
     'cached_property',
23 changes: 22 additions & 1 deletion test/test_cli.py
@@ -1,9 +1,22 @@
 import unittest
-from charset_normalizer.cli.normalizer import cli_detect
+from charset_normalizer.cli.normalizer import cli_detect, query_yes_no
+from unittest.mock import patch


 class TestCommandLineInterface(unittest.TestCase):

+    @patch('builtins.input', lambda *args: 'y')
+    def test_simple_yes_input(self):
+        self.assertTrue(
+            query_yes_no('Are u willing to chill a little bit ?')
+        )
+
+    @patch('builtins.input', lambda *args: 'N')
+    def test_simple_no_input(self):
+        self.assertFalse(
+            query_yes_no('Are u willing to chill a little bit ?')
+        )
+
     def test_single_file(self):

         self.assertEqual(
@@ -13,6 +26,14 @@ def test_single_file(self):
             )
         )

+    def test_single_verbose_file(self):
+        self.assertEqual(
+            0,
+            cli_detect(
+                ['./data/sample.1.ar.srt', '--verbose']
+            )
+        )
+
     def test_multiple_file(self):
         self.assertEqual(
             0,
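The two new prompt tests read more naturally next to the shape of query_yes_no. Below is a minimal sketch following the classic recipe — the library's actual implementation may differ in details. Because @patch('builtins.input', lambda *args: 'y') replaces input for the duration of the test, every prompt is answered 'y' (or 'N') without blocking, and the test can assert on the return value directly:

import sys

def query_yes_no(question, default='yes'):
    """Ask a yes/no question via input() and return True or False.
    Sketch of the classic recipe, not the library's exact code."""
    valid = {'yes': True, 'y': True, 'no': False, 'n': False}
    prompt = ' [Y/n] ' if default == 'yes' else ' [y/N] '
    while True:
        sys.stdout.write(question + prompt)
        choice = input().lower()
        if choice == '' and default is not None:
            return valid[default]
        if choice in valid:
            return valid[choice]
        sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")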
