Skip to content

Commit

Permalink
Merge pull request #71 from jquast/resolve-unicodedecodeerror-issue-65
Browse files Browse the repository at this point in the history
Detect target file encoding in autodetect.py
  • Loading branch information
carlio committed Dec 5, 2014
2 parents 83e8082 + 81d4624 commit f7012a9
Show file tree
Hide file tree
Showing 4 changed files with 203 additions and 5 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Prospector Changelog
* File names ending in 'tests.py' will now be ignored if prospector is set to ignore tests (previously, the regular expression only ignored files ending in 'test.py')
* [#70](https://github.com/landscapeio/prospector/issues/70) Profiles starting with a `.yml` extension can now be autoloaded
* [#62](https://github.com/landscapeio/prospector/issues/62) For human readable output, the summary of messages will now be printed at the end rather than at the start, so the summary will be what users see when running prospector (without piping into `less` etc)
* [#65](https://github.com/landscapeio/prospector/issues/65) Resolve UnicodeDecodeErrors thrown while attempting to auto-discover modules of interest by discovering target python source file encoding (PEP263), and issuing only a warning if it fails (thanks to [Jeff Quast](https://github.com/jquast)).

## Version 0.7.2

Expand Down
4 changes: 3 additions & 1 deletion CONTRIBUTORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,6 @@ Contributors

* Carl Crowder ([@carlio](https://github.com/carlio))

* Jason Simeone ([@jayclassless](https://github.com/jayclassless))
* Jason Simeone ([@jayclassless](https://github.com/jayclassless))

* Jeff Quast ([@jquast](https://github.com/jquast))
65 changes: 63 additions & 2 deletions prospector/autodetect.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import codecs
import warnings
import os
import re
from prospector.adaptor import LIBRARY_ADAPTORS
Expand All @@ -9,6 +11,52 @@
_FROM_IMPORT_REGEX = re.compile(r'^\s*from ([\._a-zA-Z0-9]+) import .*$')
_IMPORT_REGEX = re.compile(r'^\s*import ([\._a-zA-Z0-9]+)$')
_IMPORT_MULTIPLE_REGEX = re.compile(r'^\s*import ([\._a-zA-Z0-9]+(, ){1})+')
_CODING_REGEX = "coding[:=]\s*([-\w.]+)"


def detect_by_bom(path):
"""
Determine and return file encoding using BOM character.
This is necessary for files such as setuptools' tests/script-with-bom.py
:returns bom-detected encoding only if discovered
"""
with open(path, 'rb') as fin:
raw = fin.read(4)
for enc, boms in (
('utf-8-sig', (codecs.BOM_UTF8,)),
('utf-16', (codecs.BOM_UTF16_LE,codecs.BOM_UTF16_BE)),
('utf-32', (codecs.BOM_UTF32_LE,codecs.BOM_UTF32_BE))):
if any(raw.startswith(bom) for bom in boms):
return enc


def determine_pyfile_encoding(path, default='utf8'):
"""
Determine and return file encoding of a python source file.
https://www.python.org/dev/peps/pep-0263/
:returns: file encoding if determined, otherwise ``default``.
"""
encoding = detect_by_bom(path)
if encoding:
# trust the byte-order mark
return encoding

with open(path, 'rb') as fip:
# look for string of form '# coding: <encoding>'
# in the first two lines of python source file.
for line in (_line for _line in
(fip.readline(), fip.readline())
if _line.decode(default).startswith('#')):
match = next(re.finditer(_CODING_REGEX, line.decode('ascii')), None)
if match:
return match.group(1)
return default


def find_from_imports(file_contents):
names = set()
for line in file_contents.split('\n'):
Expand Down Expand Up @@ -41,8 +89,21 @@ def find_from_path(path):
if os.path.isdir(item_path):
names |= find_from_path(item_path)
elif not os.path.islink(item_path) and item_path.endswith('.py'):
with open(item_path) as fip:
names |= find_from_imports(fip.read())
try:
encoding = determine_pyfile_encoding(item_path, default='utf8')
with codecs.open(item_path, encoding=encoding) as fip:
names |= find_from_imports(fip.read())
except UnicodeDecodeError as err:
# this warning is issued: (1) in determine_pyfile_encoding for
# badly authored files (contains non-utf8 in a comment line), or
# in find_from_imports() for either (2) a coding is specified,
# but wrong and (3) no coding is specified, and the default
# 'utf8' fails to decode.
warnings.warn('{0}: {1}'.format(item_path, err), ImportWarning)
except LookupError as err:
# an invalid 'coding' statement specified in target source file
warnings.warn('{0}: {1}'.format(item_path, err), ImportWarning)


if len(names) == max_possible:
# don't continue on recursing, there's no point!
Expand Down
138 changes: 136 additions & 2 deletions tests/test_autodetect.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
# coding: utf8
import os
import sys
import codecs
import warnings
import tempfile
import contextlib
from unittest import TestCase
from prospector.autodetect import find_from_imports

from prospector.autodetect import find_from_imports, find_from_path

class FindFromImportsTest(TestCase):

Expand All @@ -9,20 +16,147 @@ def _test(self, contents, *expected_names):
self.assertEqual(set(expected_names), names)

def test_simple_imports(self):
""" imports related to django are discovered. """
self._test('from django.db import models', 'django')
self._test('import django', 'django')

def test_compound_discovered(self):
""" compound imports containing several items of interest are discovered. """
self._test('from django import db\nfrom celery import task', 'django', 'celery')

def test_nothing_of_interest(self):
""" imports containing nothing of interest return an empty set. """
self._test('import codecs')

def test_multiple_imports(self):
""" django is discovered in 'from ...' multi-import statements. """
self._test('from django.db import (models, \n'
' some, other, stuff)', 'django')

def test_indented_imports(self):
""" django is discovered from function-local import statements. """
self._test('def lala(self):\n from django.db import models\n return models.Model', 'django')

def test_same_line_two_imports(self):
""" importing two modules of interest on same line are discovered. """
self._test('import django, celery', 'django', 'celery')

class ImportCodingsTest(TestCase):

def SetUp(self):
warnings.simplefilter(action='always', category=ImportWarning)
TestCase.setUp(self)

def tearDown(self):
warnings.resetwarnings()
TestCase.tearDown(self)

if sys.version_info < (2,7):
# backport assertGreaterEqual and assertIn for python2.6.

def assertGreaterEqual(self, a, b, msg=None):
"""Just like self.assertTrue(a >= b), but with a nicer default message."""
if not a >= b:
standardMsg = '%s not greater than or equal to %s' % (safe_repr(a), safe_repr(b))
self.fail(self._formatMessage(msg, standardMsg))

def assertIn(self, member, container, msg=None):
"""Just like self.assertTrue(a in b), but with a nicer default message."""
if member not in container:
standardMsg = '%s not found in %s' % (safe_repr(member),
safe_repr(container))
self.fail(self._formatMessage(msg, standardMsg))


@contextlib.contextmanager
def make_pypath(self, bindata):
tmp_folder = tempfile.mkdtemp(prefix='prospector-')
tmp_fp = tempfile.NamedTemporaryFile(suffix='.py', dir=tmp_folder, delete=False)
tmp_file = tmp_fp.name
tmp_fp.write(bindata)
tmp_fp.close()
try:
yield tmp_folder
finally:
os.unlink(tmp_file)
os.rmdir(tmp_folder)


def test_latin1_coding_line2(self):
"""
File containing latin1 at line 2 with 'coding' declaration at line 1.
"""
# given,
bindata = (u'# coding: latin1\n'
u'# latin1 character: ®\n'
u'import django\n'
).encode('latin1')
expected_names = ('django',)
with self.make_pypath(bindata=bindata) as path:
# exercise,
names = find_from_path(path=path)
# verify.
self.assertEqual(set(expected_names), names)


def test_negative_latin1(self):
""" Negative test: file containing latin1 without 'coding' declaration. """
# given,
bindata = u'# latin1 character: ®'.encode('latin1')

with self.make_pypath(bindata=bindata) as path:
# exercise,
with warnings.catch_warnings(record=True) as warned:
find_from_path(path=path)
# verify.
self.assertGreaterEqual(len(warned), 1)
self.assertIn(ImportWarning, [_w.category for _w in warned])


def test_negative_lookuperror(self):
""" File declares an unknown coding. """
# given,
bindata = u'# coding: unknown\n'.encode('ascii')

with self.make_pypath(bindata=bindata) as path:
# exercise,
with warnings.catch_warnings(record=True) as warned:
find_from_path(path=path)
# verify.
self.assertGreaterEqual(len(warned), 1)
self.assertIn(ImportWarning, [_w.category for _w in warned])


def test_bom_encoded_filepath(self):
""" File containing only a UTF32_BE byte order mark still decodes. """
# given,
bindata = codecs.BOM_UTF32_BE
bindata += (u'import django\n'
u'# UTF-32-BE character: ®\n'
).encode('UTF-32BE')
expected_names = ('django',)
with self.make_pypath(bindata=bindata) as path:
# exercise,
names = find_from_path(path=path)
# verify.
self.assertEqual(set(expected_names), names)


def test_negative_misleading_bom(self):
""" Negative test: file containing BOM that is not the correct encoding. """
# given,
bindata = codecs.BOM_UTF32_BE
bindata += u'# latin1 character: ®'.encode('latin1')

with self.make_pypath(bindata=bindata) as path:
# exercise,
with warnings.catch_warnings(record=True) as warned:
find_from_path(path=path)
# verify.
self.assertGreaterEqual(len(warned), 1)
self.assertIn(ImportWarning, [_w.category for _w in warned])


if __name__ == '__main__':
import unittest
unittest.main()

0 comments on commit f7012a9

Please sign in to comment.