Skip to content

Commit

Permalink
Merge with #532, fix unicode filenames with surrogateescape
Browse files Browse the repository at this point in the history
  • Loading branch information
ankostis committed Oct 16, 2016
2 parents b2efa1b + 9e4a454 commit ec731f4
Show file tree
Hide file tree
Showing 7 changed files with 209 additions and 18 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.0.9dev0
2.0.10dev0
192 changes: 191 additions & 1 deletion git/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import locale
import os
import sys
import codecs


from gitdb.utils.compat import (
xrange,
Expand Down Expand Up @@ -67,7 +69,7 @@ def safe_decode(s):
if isinstance(s, unicode):
return s
elif isinstance(s, bytes):
return s.decode(defenc, 'replace')
return s.decode(defenc, 'surrogateescape')
elif s is not None:
raise TypeError('Expected bytes or text, but got %r' % (s,))

Expand Down Expand Up @@ -121,3 +123,191 @@ def __str__(self):
else: # Python 2
def __str__(self):
return self.__unicode__().encode(defenc)


"""
This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error
handler of Python 3.
Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc
"""

# This code is released under the Python license and the BSD 2-clause license


# Name of the codec error handler used by encodefilename()/decodefilename()
# and registered by register_surrogateescape().
FS_ERRORS = "surrogateescape"

# # -- Python 2/3 compatibility -------------------------------------
# FS_ERRORS = 'my_surrogateescape'

def u(text):
    """Return *text* as a text (unicode) string on Python 2 and Python 3.

    On Python 3 the argument is returned unchanged.  On Python 2 it is
    decoded with the ``unicode_escape`` codec, so escape sequences written
    in a native string literal are interpreted.
    """
    return text if PY3 else text.decode('unicode_escape')

def b(data):
    """Return *data* as a byte string on Python 2 and Python 3.

    On Python 3 the argument is encoded as latin1; on Python 2 it is
    returned unchanged.
    """
    return data.encode('latin1') if PY3 else data

# Pick the "code point -> character" and "int -> single byte" helpers that
# match the running interpreter.
if not PY3:
    _unichr = unichr
    bytes_chr = chr
else:
    _unichr = chr

    def bytes_chr(code):
        """Return a bytes object of length one holding the value *code*."""
        return bytes((code,))

def surrogateescape_handler(exc):
    """
    Codec error handler: pure Python implementation of PEP 383, the
    "surrogateescape" error handler of Python 3.

    Undecodable bytes are replaced by a Unicode character U+DCxx on
    decoding, and these are translated back on encoding.

    :param exc: the UnicodeDecodeError or UnicodeEncodeError being handled
    :return: tuple ``(replacement, exc.end)``
    :raise: ``exc`` itself when it is neither a decode nor an encode error,
        or when the offending slice cannot be converted
    """
    offending = exc.object[exc.start:exc.end]

    if isinstance(exc, UnicodeDecodeError):
        # the offending slice is a byte-string in this case
        convert = replace_surrogate_decode
    elif isinstance(exc, UnicodeEncodeError):
        # Note from the original author: for
        # u'\udcc3'.encode('ascii', 'this_surrogateescape_handler'), both
        # Python 2.x and 3.x raise an exception anyway after this function
        # is called.  It seems that the strict encoder is called to encode
        # the unicode string that this function returns ...
        convert = replace_surrogate_encode
    else:
        raise exc

    try:
        replacement = convert(offending)
    except NotASurrogateError:
        raise exc
    return (replacement, exc.end)


class NotASurrogateError(Exception):
    """Raised by the replace_surrogate_* helpers for input they cannot convert."""


def replace_surrogate_encode(mystring):
    """
    Translate escaped surrogates U+DC00..U+DCFF back to code points 0..0xFF.

    Returns a (unicode) string, not the more logical bytes, because the codecs
    register_error functionality expects this.

    :param mystring: the (unicode) slice that the encoder could not encode
    :raise NotASurrogateError: if any character lies outside U+DC00..U+DCFF,
        so that the calling error handler can re-raise the original error
    """
    decoded = []
    for ch in mystring:
        code = ord(ch)

        # Only the low-surrogate range is accepted.  Anything else, including
        # the high surrogates U+D800..U+DBFF (which would give a negative
        # code point below), is reported as "not a surrogate".  The original
        # exception object is not in scope here, so it cannot be re-raised
        # directly.
        # NOTE(review): CPython's own handler rejects U+DC00..U+DC7F as well;
        # they are still accepted here to keep the existing behaviour.
        if not 0xDC00 <= code <= 0xDCFF:
            raise NotASurrogateError
        decoded.append(_unichr(code - 0xDC00))
    return str().join(decoded)


def replace_surrogate_decode(mybytes):
    """
    Map undecodable bytes to escaped surrogates.

    Values 0x80..0xFF become U+DC80..U+DCFF; values up to 0x7F are kept as
    the character with that code point.

    :param mybytes: the byte slice that the decoder could not decode
    :return: a (unicode) string
    :raise NotASurrogateError: if an item has a value above 0xFF
    """
    pieces = []
    for item in mybytes:
        # Iterating bytes yields ints on Python 3 (or with newbytes) and
        # one-character native strings on Python 2.
        value = item if isinstance(item, int) else ord(item)
        if value > 0xFF:
            raise NotASurrogateError
        if value >= 0x80:
            pieces.append(_unichr(0xDC00 + value))
        else:
            pieces.append(_unichr(value))
    return str().join(pieces)


def encodefilename(fn):
    """
    Encode the (unicode) filename *fn* to bytes using ``FS_ENCODING``.

    Escaped surrogates U+DC80..U+DCFF are turned back into the bytes
    0x80..0xFF.  The 'ascii' and 'utf-8' encodings are handled character by
    character; every other encoding is delegated to ``fn.encode`` with the
    ``FS_ERRORS`` error handler.

    :raise UnicodeEncodeError: for a character that cannot be represented
    """
    if FS_ENCODING == 'ascii':
        # The ASCII encoder of Python 2 expects the error handler to return
        # a Unicode string encodable to ASCII, whereas our surrogateescape
        # handler has to return bytes in the 0x80-0xFF range.
        pieces = []
        for pos, char in enumerate(fn):
            codepoint = ord(char)
            if codepoint < 128:
                pieces.append(bytes_chr(codepoint))
            elif 0xDC80 <= codepoint <= 0xDCFF:
                pieces.append(bytes_chr(codepoint - 0xDC00))
            else:
                raise UnicodeEncodeError(FS_ENCODING,
                                         fn, pos, pos + 1,
                                         'ordinal not in range(128)')
        return b''.join(pieces)

    if FS_ENCODING == 'utf-8':
        # The UTF-8 encoder of Python 2 encodes surrogates, so
        # U+DC80-U+DCFF never reach our error handler.
        pieces = []
        for pos, char in enumerate(fn):
            codepoint = ord(char)
            if not 0xD800 <= codepoint <= 0xDFFF:
                pieces.append(char.encode('utf-8'))
            elif 0xDC80 <= codepoint <= 0xDCFF:
                pieces.append(bytes_chr(codepoint - 0xDC00))
            else:
                raise UnicodeEncodeError(
                    FS_ENCODING,
                    fn, pos, pos + 1, 'surrogates not allowed')
        return b''.join(pieces)

    return fn.encode(FS_ENCODING, FS_ERRORS)

def decodefilename(fn):
    """Decode the byte filename *fn* with ``FS_ENCODING`` and the ``FS_ERRORS`` handler."""
    decoded_name = fn.decode(FS_ENCODING, FS_ERRORS)
    return decoded_name

# Encoding used by encodefilename()/decodefilename(), plus a sample byte
# filename (`fn`) and its surrogate-escaped text form (`encoded`).
# NOTE(review): `fn` and `encoded` look like leftover self-test fixtures
# from the original script -- confirm before relying on them.
FS_ENCODING = 'ascii'
fn = b('[abc\xff]')
encoded = u('[abc\udcff]')
# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')


# Normalize the filesystem encoding name through the codec registry.
# For example, we expect "utf-8", not "UTF8".
FS_ENCODING = codecs.lookup(FS_ENCODING).name


def register_surrogateescape():
    """
    Registers the surrogateescape error handler on Python 2 (only)

    Nothing is done on Python 3, or when a handler is already registered
    under the ``FS_ERRORS`` name.
    """
    if not PY3:
        try:
            codecs.lookup_error(FS_ERRORS)
        except LookupError:
            # No handler with that name yet: install the pure Python one.
            codecs.register_error(FS_ERRORS, surrogateescape_handler)


# Import-time probe: decode a byte string containing the byte 0x9f with the
# "surrogateescape" handler.  If the interpreter does not know that handler
# (or codec) the lookup fails with LookupError, and the pure Python
# replacement is registered.  Only LookupError is caught, so that
# KeyboardInterrupt, SystemExit and unrelated bugs are not silently swallowed.
try:
    b"100644 \x9f\0aaa".decode(defenc, "surrogateescape")
except LookupError:
    register_surrogateescape()
2 changes: 1 addition & 1 deletion git/ext/gitdb
7 changes: 2 additions & 5 deletions git/objects/fun.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from stat import S_ISDIR
from git.compat import (
byte_ord,
safe_decode,
defenc,
xrange,
text_type,
Expand Down Expand Up @@ -76,11 +77,7 @@ def tree_entries_from_data(data):
# default encoding for strings in git is utf8
# Only use the respective unicode object if the byte stream was encoded
name = data[ns:i]
try:
name = name.decode(defenc)
except UnicodeDecodeError:
pass
# END handle encoding
name = safe_decode(name)

# byte is NULL, get next 20
i += 1
Expand Down
2 changes: 1 addition & 1 deletion git/test/performance/test_commit.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def test_iteration(self):
# END for each object
# END for each commit
elapsed_time = time() - st
print("Traversed %i Trees and a total of %i unchached objects in %s [s] ( %f objs/s )"
print("Traversed %i Trees and a total of %i uncached objects in %s [s] ( %f objs/s )"
% (nc, no, elapsed_time, no / elapsed_time), file=sys.stderr)

def test_commit_traversal(self):
Expand Down
18 changes: 11 additions & 7 deletions git/test/test_fun.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
from io import BytesIO
from stat import (
S_IFDIR,
S_IFREG,
S_IFLNK
)
from stat import S_IFDIR, S_IFREG, S_IFLNK
from unittest.case import skipIf

from git.compat import PY3
from git.index import IndexFile
from git.index.fun import (
aggressive_tree_merge
Expand Down Expand Up @@ -253,6 +251,12 @@ def test_tree_traversal_single(self):
assert entries
# END for each commit

def test_tree_entries_from_data_with_failing_name_decode(self):
@skipIf(PY3, 'odd types returned ... maybe figure it out one day')
def test_tree_entries_from_data_with_failing_name_decode_py2(self):
r = tree_entries_from_data(b'100644 \x9f\0aaa')
assert r == [('aaa', 33188, u'\udc9f')], r

@skipIf(not PY3, 'odd types returned ... maybe figure it out one day')
def test_tree_entries_from_data_with_failing_name_decode_py3(self):
r = tree_entries_from_data(b'100644 \x9f\0aaa')
assert r == [(b'aaa', 33188, b'\x9f')], r
assert r == [(b'aaa', 33188, '\udc9f')], r
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def _stamp_version(filename):
else:
print("WARNING: Couldn't find version line in file %s" % filename, file=sys.stderr)

install_requires = ['gitdb >= 0.6.4']
install_requires = ['gitdb2 >= 2.0.0']
extras_require = {
':python_version == "2.6"': ['ordereddict'],
}
Expand Down Expand Up @@ -100,7 +100,7 @@ def _stamp_version(filename):
package_data={'git.test': ['fixtures/*']},
package_dir={'git': 'git'},
license="BSD License",
requires=['gitdb (>=0.6.4)'],
requires=['gitdb2 (>=2.0.0)'],
install_requires=install_requires,
test_requirements=test_requires + install_requires,
zip_safe=False,
Expand Down

0 comments on commit ec731f4

Please sign in to comment.