Merge pull request #71 from jquast/resolve-unicodedecodeerror-issue-65

Detect target file encoding in autodetect.py
prospector-dev · Dec 5, 2014 · f7012a9 · f7012a9
2 parents 83e8082 + 81d4624
commit f7012a9
Show file tree

Hide file tree

Showing 4 changed files with 203 additions and 5 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,7 @@ Prospector Changelog
 * File names ending in 'tests.py' will now be ignored if prospector is set to ignore tests (previously, the regular expression only ignored files ending in 'test.py')
 * [#70](https://github.com/landscapeio/prospector/issues/70) Profiles starting with a `.yml` extension can now be autoloaded
 * [#62](https://github.com/landscapeio/prospector/issues/62) For human readable output, the summary of messages will now be printed at the end rather than at the start, so the summary will be what users see when running prospector (without piping into `less` etc)
+* [#65](https://github.com/landscapeio/prospector/issues/65) Resolve UnicodeDecodeErrors thrown while attempting to auto-discover modules of interest by discovering target python source file encoding (PEP263), and issuing only a warning if it fails (thanks to [Jeff Quast](https://github.com/jquast)).
 
 ## Version 0.7.2
 

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
@@ -3,4 +3,6 @@ Contributors
 
 * Carl Crowder ([@carlio](https://github.com/carlio))
 
-* Jason Simeone ([@jayclassless](https://github.com/jayclassless))
+* Jason Simeone ([@jayclassless](https://github.com/jayclassless))
+
+* Jeff Quast ([@jquast](https://github.com/jquast))
diff --git a/prospector/autodetect.py b/prospector/autodetect.py
@@ -1,3 +1,5 @@
+import codecs
+import warnings
 import os
 import re
 from prospector.adaptor import LIBRARY_ADAPTORS
@@ -9,6 +11,52 @@
 _FROM_IMPORT_REGEX = re.compile(r'^\s*from ([\._a-zA-Z0-9]+) import .*$')
 _IMPORT_REGEX = re.compile(r'^\s*import ([\._a-zA-Z0-9]+)$')
 _IMPORT_MULTIPLE_REGEX = re.compile(r'^\s*import ([\._a-zA-Z0-9]+(, ){1})+')
+_CODING_REGEX = "coding[:=]\s*([-\w.]+)"
+
+
+def detect_by_bom(path):
+    """
+    Determine and return file encoding using BOM character.
+
+    This is necessary for files such as setuptools' tests/script-with-bom.py
+
+    :returns bom-detected encoding only if discovered
+    """
+    with open(path, 'rb') as fin:
+        raw = fin.read(4)
+    for enc, boms in (
+            ('utf-8-sig', (codecs.BOM_UTF8,)),
+            ('utf-16', (codecs.BOM_UTF16_LE,codecs.BOM_UTF16_BE)),
+            ('utf-32', (codecs.BOM_UTF32_LE,codecs.BOM_UTF32_BE))):
+        if any(raw.startswith(bom) for bom in boms):
+            return enc
+
+
+def determine_pyfile_encoding(path, default='utf8'):
+    """
+    Determine and return file encoding of a python source file.
+
+    https://www.python.org/dev/peps/pep-0263/
+
+    :returns: file encoding if determined, otherwise ``default``.
+    """
+    encoding = detect_by_bom(path)
+    if encoding:
+        # trust the byte-order mark
+        return encoding
+
+    with open(path, 'rb') as fip:
+        # look for string of form '# coding: <encoding>'
+        # in the first two lines of python source file.
+        for line in (_line for _line in
+                     (fip.readline(), fip.readline())
+                     if _line.decode(default).startswith('#')):
+            match = next(re.finditer(_CODING_REGEX, line.decode('ascii')), None)
+            if match:
+                return match.group(1)
+        return default
+
+
 def find_from_imports(file_contents):
     names = set()
     for line in file_contents.split('\n'):
@@ -41,8 +89,21 @@ def find_from_path(path):
         if os.path.isdir(item_path):
             names |= find_from_path(item_path)
         elif not os.path.islink(item_path) and item_path.endswith('.py'):
-            with open(item_path) as fip:
-                names |= find_from_imports(fip.read())
+            try:
+                encoding = determine_pyfile_encoding(item_path, default='utf8')
+                with codecs.open(item_path, encoding=encoding) as fip:
+                        names |= find_from_imports(fip.read())
+            except UnicodeDecodeError as err:
+                # this warning is issued: (1) in determine_pyfile_encoding for
+                # badly authored files (contains non-utf8 in a comment line), or
+                # in find_from_imports() for either (2) a coding is specified,
+                # but wrong and (3) no coding is specified, and the default
+                # 'utf8' fails to decode.
+                warnings.warn('{0}: {1}'.format(item_path, err), ImportWarning)
+            except LookupError as err:
+                # an invalid 'coding' statement specified in target source file
+                warnings.warn('{0}: {1}'.format(item_path, err), ImportWarning)
+
 
         if len(names) == max_possible:
             # don't continue on recursing, there's no point!

diff --git a/tests/test_autodetect.py b/tests/test_autodetect.py
@@ -1,6 +1,13 @@
+# coding: utf8
+import os
+import sys
+import codecs
+import warnings
+import tempfile
+import contextlib
 from unittest import TestCase
-from prospector.autodetect import find_from_imports
 
+from prospector.autodetect import find_from_imports, find_from_path
 
 class FindFromImportsTest(TestCase):
 
@@ -9,20 +16,147 @@ def _test(self, contents, *expected_names):
         self.assertEqual(set(expected_names), names)
 
     def test_simple_imports(self):
+        """ imports related to django are discovered. """
         self._test('from django.db import models', 'django')
         self._test('import django', 'django')
+
+    def test_compound_discovered(self):
+        """ compound imports containing several items of interest are discovered. """
         self._test('from django import db\nfrom celery import task', 'django', 'celery')
 
+    def test_nothing_of_interest(self):
+        """ imports containing nothing of interest return an empty set. """
+        self._test('import codecs')
+
     def test_multiple_imports(self):
+        """ django is discovered in 'from ...' multi-import statements. """
         self._test('from django.db import (models, \n'
                    '    some, other, stuff)', 'django')
 
     def test_indented_imports(self):
+        """ django is discovered from function-local import statements. """
         self._test('def lala(self):\n    from django.db import models\n    return models.Model', 'django')
-     
+
     def test_same_line_two_imports(self):
+        """ importing two modules of interest on same line are discovered. """
         self._test('import django, celery', 'django', 'celery')
 
+class ImportCodingsTest(TestCase):
+
+    def SetUp(self):
+        warnings.simplefilter(action='always', category=ImportWarning)
+        TestCase.setUp(self)
+
+    def tearDown(self):
+        warnings.resetwarnings()
+        TestCase.tearDown(self)
+
+    if sys.version_info < (2,7):
+        # backport assertGreaterEqual and assertIn for python2.6.
+
+        def assertGreaterEqual(self, a, b, msg=None):
+            """Just like self.assertTrue(a >= b), but with a nicer default message."""
+            if not a >= b:
+                standardMsg = '%s not greater than or equal to %s' % (safe_repr(a), safe_repr(b))
+                self.fail(self._formatMessage(msg, standardMsg))
+
+        def assertIn(self, member, container, msg=None):
+            """Just like self.assertTrue(a in b), but with a nicer default message."""
+            if member not in container:
+                standardMsg = '%s not found in %s' % (safe_repr(member),
+                                                      safe_repr(container))
+                self.fail(self._formatMessage(msg, standardMsg))
+
+
+    @contextlib.contextmanager
+    def make_pypath(self, bindata):
+        tmp_folder = tempfile.mkdtemp(prefix='prospector-')
+        tmp_fp = tempfile.NamedTemporaryFile(suffix='.py', dir=tmp_folder, delete=False)
+        tmp_file = tmp_fp.name
+        tmp_fp.write(bindata)
+        tmp_fp.close()
+        try:
+            yield tmp_folder
+        finally:
+            os.unlink(tmp_file)
+            os.rmdir(tmp_folder)
+
+
+    def test_latin1_coding_line2(self):
+        """
+        File containing latin1 at line 2 with 'coding' declaration at line 1.
+        """
+        # given,
+        bindata = (u'# coding: latin1\n'
+                   u'# latin1 character: ®\n'
+                   u'import django\n'
+                   ).encode('latin1')
+        expected_names = ('django',)
+        with self.make_pypath(bindata=bindata) as path:
+            # exercise,
+            names = find_from_path(path=path)
+        # verify.
+        self.assertEqual(set(expected_names), names)
+
+
+    def test_negative_latin1(self):
+        """ Negative test: file containing latin1 without 'coding' declaration. """
+        # given,
+        bindata = u'# latin1 character: ®'.encode('latin1')
+
+        with self.make_pypath(bindata=bindata) as path:
+            # exercise,
+            with warnings.catch_warnings(record=True) as warned:
+                find_from_path(path=path)
+                # verify.
+                self.assertGreaterEqual(len(warned), 1)
+                self.assertIn(ImportWarning, [_w.category for _w in warned])
+
+
+    def test_negative_lookuperror(self):
+        """ File declares an unknown coding.  """
+        # given,
+        bindata = u'# coding: unknown\n'.encode('ascii')
+
+        with self.make_pypath(bindata=bindata) as path:
+            # exercise,
+            with warnings.catch_warnings(record=True) as warned:
+                find_from_path(path=path)
+                # verify.
+                self.assertGreaterEqual(len(warned), 1)
+                self.assertIn(ImportWarning, [_w.category for _w in warned])
+
+
+    def test_bom_encoded_filepath(self):
+        """ File containing only a UTF32_BE byte order mark still decodes.  """
+        # given,
+        bindata = codecs.BOM_UTF32_BE
+        bindata += (u'import django\n'
+                    u'# UTF-32-BE character: ®\n'
+                    ).encode('UTF-32BE')
+        expected_names = ('django',)
+        with self.make_pypath(bindata=bindata) as path:
+            # exercise,
+            names = find_from_path(path=path)
+        # verify.
+        self.assertEqual(set(expected_names), names)
+
+
+    def test_negative_misleading_bom(self):
+        """ Negative test: file containing BOM that is not the correct encoding. """
+        # given,
+        bindata = codecs.BOM_UTF32_BE
+        bindata += u'# latin1 character: ®'.encode('latin1')
+
+        with self.make_pypath(bindata=bindata) as path:
+            # exercise,
+            with warnings.catch_warnings(record=True) as warned:
+                find_from_path(path=path)
+                # verify.
+                self.assertGreaterEqual(len(warned), 1)
+                self.assertIn(ImportWarning, [_w.category for _w in warned])
+
+
 if __name__ == '__main__':
     import unittest
     unittest.main()